mirror of https://github.com/facebook/rocksdb.git
Merge branch 'main' into JniReaderForTableIterator
This commit is contained in:
commit
21eca90d5e
|
@ -1,13 +1,13 @@
|
|||
name: facebook/rocksdb/benchmark-linux
|
||||
on: workflow_dispatch
|
||||
jobs:
|
||||
# FIXME: when this job is fixed, it should be given a cron schedule like
|
||||
permissions: {}
|
||||
# FIXME: Disabled temporarily
|
||||
# schedule:
|
||||
# - cron: 0 * * * *
|
||||
# workflow_dispatch:
|
||||
# - cron: 7 */2 * * * # At minute 7 past every 2nd hour
|
||||
jobs:
|
||||
benchmark-linux:
|
||||
if: ${{ github.repository_owner == 'facebook' }}
|
||||
runs-on: ubuntu-latest
|
||||
runs-on: ubuntu-latest # FIXME: change this back to self-hosted when ready
|
||||
steps:
|
||||
- uses: actions/checkout@v4.1.0
|
||||
- uses: "./.github/actions/build-for-benchmarks"
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
name: facebook/rocksdb/nightly
|
||||
on: workflow_dispatch
|
||||
permissions: {}
|
||||
jobs:
|
||||
# These jobs would be in nightly but are failing or otherwise broken for
|
||||
# some reason.
|
||||
|
|
|
@ -3,6 +3,7 @@ on:
|
|||
schedule:
|
||||
- cron: 0 9 * * *
|
||||
workflow_dispatch:
|
||||
permissions: {}
|
||||
jobs:
|
||||
build-format-compatible:
|
||||
if: ${{ github.repository_owner == 'facebook' }}
|
||||
|
@ -59,12 +60,15 @@ jobs:
|
|||
container:
|
||||
image: zjay437/rocksdb:0.6
|
||||
options: --shm-size=16gb
|
||||
env:
|
||||
CC: clang-13
|
||||
CXX: clang++-13
|
||||
steps:
|
||||
- uses: actions/checkout@v4.1.0
|
||||
- uses: "./.github/actions/pre-steps"
|
||||
- uses: "./.github/actions/setup-folly"
|
||||
- uses: "./.github/actions/build-folly"
|
||||
- run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
|
||||
- run: LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
|
||||
- uses: "./.github/actions/post-steps"
|
||||
build-linux-valgrind:
|
||||
if: ${{ github.repository_owner == 'facebook' }}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
name: facebook/rocksdb/pr-jobs-candidate
|
||||
on: workflow_dispatch
|
||||
permissions: {}
|
||||
jobs:
|
||||
# These jobs would be in pr-jobs but are failing or otherwise broken for
|
||||
# some reason.
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
name: facebook/rocksdb/pr-jobs
|
||||
on: [push, pull_request]
|
||||
permissions: {}
|
||||
jobs:
|
||||
# NOTE: multiple workflows would be recommended, but the current GHA UI in
|
||||
# PRs doesn't make it clear when there's an overall error with a workflow,
|
||||
|
|
22
HISTORY.md
22
HISTORY.md
|
@ -1,6 +1,28 @@
|
|||
# Rocksdb Change Log
|
||||
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
|
||||
|
||||
## 9.6.0 (08/19/2024)
|
||||
### New Features
|
||||
* Best efforts recovery supports recovering to an incomplete Version with a clean seqno cut that presents a valid point-in-time view from the user's perspective, if versioning history doesn't include atomic flush.
|
||||
* New option `BlockBasedTableOptions::decouple_partitioned_filters` should improve efficiency in serving read queries because filter and index partitions can consistently target the configured `metadata_block_size`. This option is currently opt-in.
|
||||
* Introduce a new mutable CF option `paranoid_memory_checks`. It enables additional validation on data integrity during reads/scanning. Currently, skip list based memtable will validate key ordering during look up and scans.
|
||||
|
||||
### Public API Changes
|
||||
* Add ticker stats to count file read retries due to checksum mismatch
|
||||
* Adds optional installation callback function for remote compaction
|
||||
|
||||
### Behavior Changes
|
||||
* There may be less intra-L0 compaction triggered by total L0 size being too small. We now use compensated file size (tombstones are assigned some value size) when calculating L0 size and reduce the threshold for L0 size limit. This is to avoid accumulating too much data/tombstones in L0.
|
||||
|
||||
### Bug Fixes
|
||||
* Make DestroyDB support slow deletion when it's configured in `SstFileManager`. The slow deletion is subject to the configured `rate_bytes_per_sec`, but not subject to the `max_trash_db_ratio`.
|
||||
* Fixed a bug where we set unprep_seqs_ even when WriteImpl() fails. This was caught by stress test write fault injection in WriteImpl(). This may have incorrectly caused iteration creation failure for unvalidated writes or returned wrong result for WriteUnpreparedTxn::GetUnpreparedSequenceNumbers().
|
||||
* Fixed a bug where a successful write right after error recovery for the last failed write finishes causes duplicate WAL entries.
|
||||
* Fixed a data race involving the background error status in `unordered_write` mode.
|
||||
* Fix a bug where file snapshot functions like backup and checkpoint may attempt to copy a non-existing manifest file. #12882
|
||||
* Fix a bug where per kv checksum corruption may be ignored in MultiGet().
|
||||
* Fix a race condition in pessimistic transactions that could allow multiple transactions with the same name to be registered simultaneously, resulting in a crash or other unpredictable behavior.
|
||||
|
||||
## 9.5.0 (07/19/2024)
|
||||
### Public API Changes
|
||||
* Introduced new C API function rocksdb_writebatch_iterate_cf for column family-aware iteration over the contents of a WriteBatch
|
||||
|
|
|
@ -1652,6 +1652,9 @@ bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
|
|||
}
|
||||
for (const Slice& table_newest_udt :
|
||||
imm()->GetTablesNewestUDT(max_memtable_id)) {
|
||||
if (table_newest_udt.empty()) {
|
||||
continue;
|
||||
}
|
||||
assert(table_newest_udt.size() == full_history_ts_low.size());
|
||||
// Checking the newest UDT contained in MemTable with ascending ID up to
|
||||
// `max_memtable_id`. Return immediately on finding the first MemTable that
|
||||
|
|
|
@ -3067,12 +3067,20 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
|
|||
WaitForCompaction();
|
||||
AssertFilesPerLevel("0,1", 0 /* cf */);
|
||||
|
||||
// We should calculate the limit by obtaining the number of env background
|
||||
// threads, because the current test case will share the same env
|
||||
// with another case that may have already increased the number of
|
||||
// background threads which is larger than kParallelismLimit
|
||||
const auto limit = env_->GetBackgroundThreads(Env::Priority::LOW);
|
||||
|
||||
// Block the compaction thread pool so marked files accumulate in L0.
|
||||
test::SleepingBackgroundTask sleeping_tasks[kParallelismLimit];
|
||||
for (int i = 0; i < kParallelismLimit; i++) {
|
||||
std::vector<std::shared_ptr<test::SleepingBackgroundTask>> sleeping_tasks;
|
||||
for (int i = 0; i < limit; i++) {
|
||||
sleeping_tasks.emplace_back(
|
||||
std::make_shared<test::SleepingBackgroundTask>());
|
||||
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
|
||||
&sleeping_tasks[i], Env::Priority::LOW);
|
||||
sleeping_tasks[i].WaitUntilSleeping();
|
||||
sleeping_tasks[i].get(), Env::Priority::LOW);
|
||||
sleeping_tasks[i]->WaitUntilSleeping();
|
||||
}
|
||||
|
||||
// Zero marked upper-level files. No speedup.
|
||||
|
@ -3091,9 +3099,9 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForMarkedFiles) {
|
|||
ASSERT_EQ(kParallelismLimit, dbfull()->TEST_BGCompactionsAllowed());
|
||||
AssertFilesPerLevel("2,1", 0 /* cf */);
|
||||
|
||||
for (int i = 0; i < kParallelismLimit; i++) {
|
||||
sleeping_tasks[i].WakeUp();
|
||||
sleeping_tasks[i].WaitUntilDone();
|
||||
for (int i = 0; i < limit; i++) {
|
||||
sleeping_tasks[i]->WakeUp();
|
||||
sleeping_tasks[i]->WaitUntilDone();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -552,7 +552,8 @@ class CompactionJobTestBase : public testing::Test {
|
|||
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
|
||||
/*error_handler=*/nullptr, /*read_only=*/false));
|
||||
compaction_job_stats_.Reset();
|
||||
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
|
||||
ASSERT_OK(
|
||||
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
|
||||
|
||||
VersionEdit new_db;
|
||||
new_db.SetLogNumber(0);
|
||||
|
@ -575,7 +576,8 @@ class CompactionJobTestBase : public testing::Test {
|
|||
}
|
||||
ASSERT_OK(s);
|
||||
// Make "CURRENT" file that points to the new manifest file.
|
||||
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
|
||||
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
Temperature::kUnknown, nullptr);
|
||||
|
||||
ASSERT_OK(s);
|
||||
|
||||
|
|
|
@ -925,11 +925,15 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {
|
|||
}
|
||||
uint64_t l0_size = 0;
|
||||
for (const auto& file : l0_files) {
|
||||
l0_size += file->fd.GetFileSize();
|
||||
assert(file->compensated_file_size >= file->fd.GetFileSize());
|
||||
// Compact down L0s with more deletions.
|
||||
l0_size += file->compensated_file_size;
|
||||
}
|
||||
const uint64_t min_lbase_size =
|
||||
l0_size * static_cast<uint64_t>(std::max(
|
||||
10.0, mutable_cf_options_.max_bytes_for_level_multiplier));
|
||||
|
||||
// Avoid L0->Lbase compactions that are inefficient for write-amp.
|
||||
const double kMultiplier =
|
||||
std::max(10.0, mutable_cf_options_.max_bytes_for_level_multiplier) * 2;
|
||||
const uint64_t min_lbase_size = MultiplyCheckOverflow(l0_size, kMultiplier);
|
||||
assert(min_lbase_size >= l0_size);
|
||||
const std::vector<FileMetaData*>& lbase_files =
|
||||
vstorage_->LevelFiles(/*level=*/base_level);
|
||||
|
|
|
@ -214,7 +214,10 @@ class CompactionPickerTest : public CompactionPickerTestBase {
|
|||
explicit CompactionPickerTest()
|
||||
: CompactionPickerTestBase(BytewiseComparator()) {}
|
||||
|
||||
~CompactionPickerTest() override = default;
|
||||
// Resets global sync-point state when the fixture is torn down: removes all
// registered callbacks first, then turns sync-point processing off.
~CompactionPickerTest() override {
  auto* sync_point = SyncPoint::GetInstance();
  sync_point->ClearAllCallBacks();
  sync_point->DisableProcessing();
}
|
||||
};
|
||||
|
||||
class CompactionPickerU64TsTest : public CompactionPickerTestBase {
|
||||
|
@ -4284,27 +4287,28 @@ TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) {
|
|||
SCOPED_TRACE("lbase_size_multiplier=" +
|
||||
std::to_string(lbase_size_multiplier));
|
||||
NewVersionStorage(6, kCompactionStyleLevel);
|
||||
// When L0 size is <= Lbase size / max_bytes_for_level_multiplier,
|
||||
// When L0 size is <= Lbase size / max_bytes_for_level_multiplier / 2,
|
||||
// intra-L0 compaction is picked. Otherwise, L0->L1
|
||||
// compaction is picked.
|
||||
// compensated_file_size will be used to compute total l0 size.
|
||||
Add(/*level=*/0, /*file_number=*/1U, /*smallest=*/"100",
|
||||
/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
|
||||
/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
|
||||
/*smallest_seq=*/10, /*largest_seq=*/11,
|
||||
/*compensated_file_size=*/1000);
|
||||
Add(/*level=*/0, /*file_number=*/2U, /*smallest=*/"100",
|
||||
/*largest=*/"100", /*file_size=*/1000, /*path_id=*/0,
|
||||
/*largest=*/"100", /*file_size=*/10, /*path_id=*/0,
|
||||
/*smallest_seq=*/20, /*largest_seq=*/21,
|
||||
/*compensated_file_size=*/1000);
|
||||
Add(/*level=*/0, /*file_number=*/3U, /*smallest=*/"100",
|
||||
/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
|
||||
/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
|
||||
/*smallest_seq=*/30, /*largest_seq=*/31,
|
||||
/*compensated_file_size=*/1000);
|
||||
Add(/*level=*/0, /*file_number=*/4U, /*smallest=*/"100",
|
||||
/*largest=*/"200", /*file_size=*/1000, /*path_id=*/0,
|
||||
/*largest=*/"200", /*file_size=*/10, /*path_id=*/0,
|
||||
/*smallest_seq=*/40, /*largest_seq=*/41,
|
||||
/*compensated_file_size=*/1000);
|
||||
const uint64_t l0_size = 4000;
|
||||
const uint64_t lbase_size = l0_size * lbase_size_multiplier;
|
||||
const uint64_t lbase_size = l0_size * lbase_size_multiplier * 2;
|
||||
Add(/*level=*/1, /*file_number=*/5U, /*smallest=*/"100",
|
||||
/*largest=*/"200", /*file_size=*/lbase_size, /*path_id=*/0,
|
||||
/*smallest_seq=*/0, /*largest_seq=*/0,
|
||||
|
|
|
@ -140,9 +140,13 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
|
|||
return compaction_status;
|
||||
}
|
||||
|
||||
// CompactionServiceJobStatus::kSuccess was returned, but somehow we failed to
|
||||
// read the result. Consider this as an installation failure
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
compaction_result.status.PermitUncheckedError();
|
||||
db_options_.compaction_service->OnInstallation(
|
||||
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
sub_compact->status = compaction_result.status;
|
||||
|
@ -154,18 +158,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
|
|||
is_first_one = false;
|
||||
}
|
||||
|
||||
ROCKS_LOG_INFO(db_options_.info_log,
|
||||
"[%s] [JOB %d] Receive remote compaction result, output path: "
|
||||
"%s, files: %s",
|
||||
compaction_input.column_family.name.c_str(), job_id_,
|
||||
compaction_result.output_path.c_str(),
|
||||
output_files_oss.str().c_str());
|
||||
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
ROCKS_LOG_INFO(
|
||||
db_options_.info_log,
|
||||
"[%s] [JOB %d] Received remote compaction result, output path: "
|
||||
"%s, files: %s",
|
||||
compaction_input.column_family.name.c_str(), job_id_,
|
||||
compaction_result.output_path.c_str(), output_files_oss.str().c_str());
|
||||
|
||||
// Installation Starts
|
||||
for (const auto& file : compaction_result.output_files) {
|
||||
uint64_t file_num = versions_->NewFileNumber();
|
||||
auto src_file = compaction_result.output_path + "/" + file.file_name;
|
||||
|
@ -174,6 +174,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
|
|||
s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
db_options_.compaction_service->OnInstallation(
|
||||
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
|
||||
|
@ -182,6 +184,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
|
|||
s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
|
||||
if (!s.ok()) {
|
||||
sub_compact->status = s;
|
||||
db_options_.compaction_service->OnInstallation(
|
||||
response.scheduled_job_id, CompactionServiceJobStatus::kFailure);
|
||||
return CompactionServiceJobStatus::kFailure;
|
||||
}
|
||||
meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
|
||||
|
@ -206,6 +210,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
|
|||
RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
|
||||
RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
|
||||
compaction_result.bytes_written);
|
||||
db_options_.compaction_service->OnInstallation(
|
||||
response.scheduled_job_id, CompactionServiceJobStatus::kSuccess);
|
||||
return CompactionServiceJobStatus::kSuccess;
|
||||
}
|
||||
|
||||
|
|
|
@ -108,6 +108,11 @@ class MyTestCompactionService : public CompactionService {
|
|||
}
|
||||
}
|
||||
|
||||
void OnInstallation(const std::string& /*scheduled_job_id*/,
|
||||
CompactionServiceJobStatus status) override {
|
||||
final_updated_status_ = status;
|
||||
}
|
||||
|
||||
int GetCompactionNum() { return compaction_num_.load(); }
|
||||
|
||||
CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
|
||||
|
@ -136,6 +141,10 @@ class MyTestCompactionService : public CompactionService {
|
|||
|
||||
void SetCanceled(bool canceled) { canceled_ = canceled; }
|
||||
|
||||
// Returns the most recent status stored in `final_updated_status_`.
CompactionServiceJobStatus GetFinalCompactionServiceJobStatus() {
  const CompactionServiceJobStatus latest = final_updated_status_.load();
  return latest;
}
|
||||
|
||||
private:
|
||||
InstrumentedMutex mutex_;
|
||||
std::atomic_int compaction_num_{0};
|
||||
|
@ -158,6 +167,8 @@ class MyTestCompactionService : public CompactionService {
|
|||
std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
|
||||
table_properties_collector_factories_;
|
||||
std::atomic_bool canceled_{false};
|
||||
std::atomic<CompactionServiceJobStatus> final_updated_status_{
|
||||
CompactionServiceJobStatus::kUseLocal};
|
||||
};
|
||||
|
||||
class CompactionServiceTest : public DBTestBase {
|
||||
|
@ -255,6 +266,8 @@ TEST_F(CompactionServiceTest, BasicCompactions) {
|
|||
|
||||
auto my_cs = GetCompactionService();
|
||||
ASSERT_GE(my_cs->GetCompactionNum(), 1);
|
||||
ASSERT_EQ(CompactionServiceJobStatus::kSuccess,
|
||||
my_cs->GetFinalCompactionServiceJobStatus());
|
||||
|
||||
// make sure the compaction statistics is only recorded on the remote side
|
||||
ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
|
||||
|
@ -437,6 +450,8 @@ TEST_F(CompactionServiceTest, InvalidResult) {
|
|||
Slice end(end_str);
|
||||
Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
|
||||
ASSERT_FALSE(s.ok());
|
||||
ASSERT_EQ(CompactionServiceJobStatus::kFailure,
|
||||
my_cs->GetFinalCompactionServiceJobStatus());
|
||||
}
|
||||
|
||||
TEST_F(CompactionServiceTest, SubCompaction) {
|
||||
|
|
|
@ -3407,6 +3407,46 @@ class TableFileListener : public EventListener {
|
|||
InstrumentedMutex mutex_;
|
||||
std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
|
||||
};
|
||||
|
||||
class FlushTableFileListener : public EventListener {
|
||||
public:
|
||||
void OnTableFileCreated(const TableFileCreationInfo& info) override {
|
||||
InstrumentedMutexLock lock(&mutex_);
|
||||
if (info.reason != TableFileCreationReason::kFlush) {
|
||||
return;
|
||||
}
|
||||
cf_to_flushed_files_[info.cf_name].push_back(info.file_path);
|
||||
}
|
||||
std::vector<std::string>& GetFlushedFiles(const std::string& cf_name) {
|
||||
InstrumentedMutexLock lock(&mutex_);
|
||||
return cf_to_flushed_files_[cf_name];
|
||||
}
|
||||
|
||||
private:
|
||||
InstrumentedMutex mutex_;
|
||||
std::unordered_map<std::string, std::vector<std::string>>
|
||||
cf_to_flushed_files_;
|
||||
};
|
||||
|
||||
class FlushBlobFileListener : public EventListener {
|
||||
public:
|
||||
void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
|
||||
InstrumentedMutexLock lock(&mutex_);
|
||||
if (info.reason != BlobFileCreationReason::kFlush) {
|
||||
return;
|
||||
}
|
||||
cf_to_flushed_blobs_files_[info.cf_name].push_back(info.file_path);
|
||||
}
|
||||
std::vector<std::string>& GetFlushedBlobFiles(const std::string& cf_name) {
|
||||
InstrumentedMutexLock lock(&mutex_);
|
||||
return cf_to_flushed_blobs_files_[cf_name];
|
||||
}
|
||||
|
||||
private:
|
||||
InstrumentedMutex mutex_;
|
||||
std::unordered_map<std::string, std::vector<std::string>>
|
||||
cf_to_flushed_blobs_files_;
|
||||
};
|
||||
} // anonymous namespace
|
||||
|
||||
TEST_F(DBBasicTest, LastSstFileNotInManifest) {
|
||||
|
@ -3512,6 +3552,121 @@ TEST_F(DBBasicTest, RecoverWithMissingFiles) {
|
|||
}
|
||||
}
|
||||
|
||||
// Param 0: whether to enable blob DB.
|
||||
// Param 1: when blob DB is enabled, whether to also delete the missing L0
|
||||
// file's associated blob file.
|
||||
class BestEffortsRecoverIncompleteVersionTest
|
||||
: public DBTestBase,
|
||||
public testing::WithParamInterface<std::tuple<bool, bool>> {
|
||||
public:
|
||||
BestEffortsRecoverIncompleteVersionTest()
|
||||
: DBTestBase("best_efforts_recover_incomplete_version_test",
|
||||
/*env_do_fsync=*/false) {}
|
||||
};
|
||||
|
||||
// Scenario: for each of three column families, write and flush four files
// (with manual compactions in between), then delete the most recently flushed
// table file -- and optionally its blob file -- from disk. Reopening with
// `best_efforts_recovery` is expected to yield a version with no L0 files and
// a single L1 file per column family, exposing only keys "a" and "b". The DB
// must remain writable afterwards.
TEST_P(BestEffortsRecoverIncompleteVersionTest, Basic) {
  Options options = CurrentOptions();
  options.enable_blob_files = std::get<0>(GetParam());
  bool delete_blob_file_too = std::get<1>(GetParam());
  DestroyAndReopen(options);
  FlushTableFileListener* flush_table_listener = new FlushTableFileListener();
  FlushBlobFileListener* flush_blob_listener = new FlushBlobFileListener();
  // Disable auto compaction to simplify SST file name tracking.
  options.disable_auto_compactions = true;
  options.listeners.emplace_back(flush_table_listener);
  options.listeners.emplace_back(flush_blob_listener);
  CreateAndReopenWithCF({"pikachu", "eevee"}, options);
  std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
                                           "eevee"};
  int num_cfs = static_cast<int>(handles_.size());
  ASSERT_EQ(3, num_cfs);
  std::string start = "a";
  Slice start_slice = start;
  std::string end = "d";
  Slice end_slice = end;
  for (int cf = 0; cf != num_cfs; ++cf) {
    ASSERT_OK(Put(cf, "a", "a_value"));
    ASSERT_OK(Flush(cf));
    // Compact file to L1 to avoid trivial file move in the next compaction
    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
                                &start_slice, &end_slice));
    ASSERT_OK(Put(cf, "a", "a_value_new"));
    ASSERT_OK(Flush(cf));
    ASSERT_OK(Put(cf, "b", "b_value"));
    ASSERT_OK(Flush(cf));
    ASSERT_OK(Put(cf, "f", "f_value"));
    ASSERT_OK(Flush(cf));
    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf],
                                &start_slice, &end_slice));
  }

  dbfull()->TEST_DeleteObsoleteFiles();

  // Delete the most recent L0 file which is before a compaction.
  for (int i = 0; i < num_cfs; ++i) {
    std::vector<std::string>& files =
        flush_table_listener->GetFlushedFiles(all_cf_names[i]);
    ASSERT_EQ(4, files.size());
    ASSERT_OK(env_->DeleteFile(files.back()));
    if (options.enable_blob_files) {
      std::vector<std::string>& blob_files =
          flush_blob_listener->GetFlushedBlobFiles(all_cf_names[i]);
      ASSERT_EQ(4, blob_files.size());
      if (delete_blob_file_too) {
        // Index by the blob list's own size, not the table file list's.
        ASSERT_OK(env_->DeleteFile(blob_files.back()));
      }
    }
  }
  options.best_efforts_recovery = true;
  ReopenWithColumnFamilies(all_cf_names, options);

  for (int i = 0; i < num_cfs; ++i) {
    auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
    ColumnFamilyData* cfd = cfh->cfd();
    VersionStorageInfo* vstorage = cfd->current()->storage_info();
    // The L0 file flushed right before the last compaction is missing.
    ASSERT_EQ(0, vstorage->LevelFiles(0).size());
    // Only the output of the last compaction is available.
    ASSERT_EQ(1, vstorage->LevelFiles(1).size());
  }
  // Verify data
  ReadOptions read_opts;
  read_opts.total_order_seek = true;
  for (int i = 0; i < num_cfs; ++i) {
    std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[i]));
    iter->SeekToFirst();
    ASSERT_TRUE(iter->Valid());
    ASSERT_OK(iter->status());
    ASSERT_EQ("a", iter->key());
    ASSERT_EQ("a_value_new", iter->value());
    iter->Next();
    ASSERT_TRUE(iter->Valid());
    ASSERT_OK(iter->status());
    ASSERT_EQ("b", iter->key());
    ASSERT_EQ("b_value", iter->value());
    iter->Next();
    // Key "f" lived only in the deleted file, so iteration ends here.
    ASSERT_FALSE(iter->Valid());
    ASSERT_OK(iter->status());
  }

  // Write more data.
  for (int cf = 0; cf < num_cfs; ++cf) {
    ASSERT_OK(Put(cf, "g", "g_value"));
    ASSERT_OK(Flush(cf));
    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
                                nullptr));
    std::string value;
    ASSERT_OK(db_->Get(ReadOptions(), handles_[cf], "g", &value));
    ASSERT_EQ("g_value", value);
  }
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(BestEffortsRecoverIncompleteVersionTest,
|
||||
BestEffortsRecoverIncompleteVersionTest,
|
||||
testing::Values(std::make_tuple(false, false),
|
||||
std::make_tuple(true, false),
|
||||
std::make_tuple(true, true)));
|
||||
|
||||
TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
|
||||
Options options = CurrentOptions();
|
||||
options.env = env_;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -289,10 +289,12 @@ TEST_F(DBFollowerTest, RetryCatchup) {
|
|||
{"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
|
||||
{"DBImpl::BackgroundCompaction:Start",
|
||||
"DBImplFollower::TryCatchupWithLeader:Begin2"},
|
||||
{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
|
||||
{"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
|
||||
"Begin1",
|
||||
"DBImpl::BackgroundCompaction:BeforeCompaction"},
|
||||
{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
|
||||
"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
|
||||
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
|
||||
"Begin2"},
|
||||
{"DBImplFollower::TryCatchupWithLeader:End", "Follower::WaitForCatchup"},
|
||||
});
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
|
@ -335,10 +337,12 @@ TEST_F(DBFollowerTest, RetryCatchupManifestRollover) {
|
|||
SyncPoint::GetInstance()->LoadDependency({
|
||||
{"DBImplFollower::TryCatchupWithLeader:Begin1", "Leader::Start"},
|
||||
{"Leader::Flushed", "DBImplFollower::TryCatchupWithLeader:Begin2"},
|
||||
{"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1",
|
||||
{"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
|
||||
"Begin1",
|
||||
"Leader::Done"},
|
||||
{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
|
||||
"VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2"},
|
||||
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
|
||||
"Begin2"},
|
||||
{"DBImplFollower::TryCatchupWithLeader:End",
|
||||
"Follower::WaitForCatchup:1"},
|
||||
});
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <cstdio>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
@ -2475,7 +2476,8 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
|
|||
RecordTick(stats_, MEMTABLE_HIT);
|
||||
}
|
||||
}
|
||||
if (!done && !s.ok() && !s.IsMergeInProgress()) {
|
||||
if (!s.ok() && !s.IsMergeInProgress() && !s.IsNotFound()) {
|
||||
assert(done);
|
||||
ReturnAndCleanupSuperVersion(cfd, sv);
|
||||
return s;
|
||||
}
|
||||
|
@ -3141,10 +3143,11 @@ Status DBImpl::MultiGetImpl(
|
|||
StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
|
||||
|
||||
assert(sorted_keys);
|
||||
assert(start_key + num_keys <= sorted_keys->size());
|
||||
// Clear the timestamps for returning results so that we can distinguish
|
||||
// between tombstone or key that has never been written
|
||||
for (auto* kctx : *sorted_keys) {
|
||||
assert(kctx);
|
||||
for (size_t i = start_key; i < start_key + num_keys; ++i) {
|
||||
KeyContext* kctx = (*sorted_keys)[i];
|
||||
if (kctx->timestamp) {
|
||||
kctx->timestamp->clear();
|
||||
}
|
||||
|
@ -5240,6 +5243,14 @@ Status DestroyDB(const std::string& dbname, const Options& options,
|
|||
Env* env = soptions.env;
|
||||
std::vector<std::string> filenames;
|
||||
bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
|
||||
auto sfm = static_cast_with_check<SstFileManagerImpl>(
|
||||
options.sst_file_manager.get());
|
||||
// Allocate a separate trash bucket to be used by all the to be deleted
|
||||
// files, so we can later wait for this bucket to be empty before return.
|
||||
std::optional<int32_t> bucket;
|
||||
if (sfm) {
|
||||
bucket = sfm->NewTrashBucket();
|
||||
}
|
||||
|
||||
// Reset the logger because it holds a handle to the
|
||||
// log file and prevents cleanup and directory removal
|
||||
|
@ -5251,6 +5262,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
|
|||
/*IODebugContext*=*/nullptr)
|
||||
.PermitUncheckedError();
|
||||
|
||||
std::set<std::string> paths_to_delete;
|
||||
FileLock* lock;
|
||||
const std::string lockname = LockFileName(dbname);
|
||||
Status result = env->LockFile(lockname, &lock);
|
||||
|
@ -5267,10 +5279,9 @@ Status DestroyDB(const std::string& dbname, const Options& options,
|
|||
del = DestroyDB(path_to_delete, options);
|
||||
} else if (type == kTableFile || type == kWalFile ||
|
||||
type == kBlobFile) {
|
||||
del = DeleteDBFile(
|
||||
&soptions, path_to_delete, dbname,
|
||||
/*force_bg=*/false,
|
||||
/*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
|
||||
del = DeleteUnaccountedDBFile(&soptions, path_to_delete, dbname,
|
||||
/*force_bg=*/false,
|
||||
/*force_fg=*/false, bucket);
|
||||
} else {
|
||||
del = env->DeleteFile(path_to_delete);
|
||||
}
|
||||
|
@ -5279,6 +5290,7 @@ Status DestroyDB(const std::string& dbname, const Options& options,
|
|||
}
|
||||
}
|
||||
}
|
||||
paths_to_delete.insert(dbname);
|
||||
|
||||
std::set<std::string> paths;
|
||||
for (const DbPath& db_path : options.db_paths) {
|
||||
|
@ -5300,18 +5312,19 @@ Status DestroyDB(const std::string& dbname, const Options& options,
|
|||
(type == kTableFile ||
|
||||
type == kBlobFile)) { // Lock file will be deleted at end
|
||||
std::string file_path = path + "/" + fname;
|
||||
Status del = DeleteDBFile(&soptions, file_path, dbname,
|
||||
/*force_bg=*/false, /*force_fg=*/false);
|
||||
Status del = DeleteUnaccountedDBFile(&soptions, file_path, dbname,
|
||||
/*force_bg=*/false,
|
||||
/*force_fg=*/false, bucket);
|
||||
if (!del.ok() && result.ok()) {
|
||||
result = del;
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: Should we return an error if we cannot delete the directory?
|
||||
env->DeleteDir(path).PermitUncheckedError();
|
||||
}
|
||||
}
|
||||
|
||||
paths_to_delete.merge(paths);
|
||||
|
||||
std::vector<std::string> walDirFiles;
|
||||
std::string archivedir = ArchivalDirectory(dbname);
|
||||
bool wal_dir_exists = false;
|
||||
|
@ -5335,46 +5348,49 @@ Status DestroyDB(const std::string& dbname, const Options& options,
|
|||
// Delete archival files.
|
||||
for (const auto& file : archiveFiles) {
|
||||
if (ParseFileName(file, &number, &type) && type == kWalFile) {
|
||||
Status del =
|
||||
DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
|
||||
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
|
||||
Status del = DeleteUnaccountedDBFile(
|
||||
&soptions, archivedir + "/" + file, archivedir,
|
||||
/*force_bg=*/false, /*force_fg=*/!wal_in_db_path, bucket);
|
||||
if (!del.ok() && result.ok()) {
|
||||
result = del;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Ignore error in case dir contains other files
|
||||
env->DeleteDir(archivedir).PermitUncheckedError();
|
||||
paths_to_delete.insert(archivedir);
|
||||
}
|
||||
|
||||
// Delete log files in the WAL dir
|
||||
if (wal_dir_exists) {
|
||||
for (const auto& file : walDirFiles) {
|
||||
if (ParseFileName(file, &number, &type) && type == kWalFile) {
|
||||
Status del =
|
||||
DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
|
||||
soptions.wal_dir, /*force_bg=*/false,
|
||||
/*force_fg=*/!wal_in_db_path);
|
||||
Status del = DeleteUnaccountedDBFile(
|
||||
&soptions, LogFileName(soptions.wal_dir, number),
|
||||
soptions.wal_dir, /*force_bg=*/false,
|
||||
/*force_fg=*/!wal_in_db_path, bucket);
|
||||
if (!del.ok() && result.ok()) {
|
||||
result = del;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Ignore error in case dir contains other files
|
||||
env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
|
||||
paths_to_delete.insert(soptions.wal_dir);
|
||||
}
|
||||
|
||||
// Ignore error since state is already gone
|
||||
env->UnlockFile(lock).PermitUncheckedError();
|
||||
env->DeleteFile(lockname).PermitUncheckedError();
|
||||
|
||||
// Make sure trash files are all cleared before return.
|
||||
if (sfm && bucket.has_value()) {
|
||||
sfm->WaitForEmptyTrashBucket(bucket.value());
|
||||
}
|
||||
// sst_file_manager holds a ref to the logger. Make sure the logger is
|
||||
// gone before trying to remove the directory.
|
||||
soptions.sst_file_manager.reset();
|
||||
|
||||
// Ignore error in case dir contains other files
|
||||
env->DeleteDir(dbname).PermitUncheckedError();
|
||||
;
|
||||
for (const auto& path_to_delete : paths_to_delete) {
|
||||
env->DeleteDir(path_to_delete).PermitUncheckedError();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -5820,11 +5836,6 @@ Status DBImpl::IngestExternalFiles(
|
|||
"write_global_seqno is deprecated and does not work with "
|
||||
"allow_db_generated_files.");
|
||||
}
|
||||
if (ingest_opts.move_files) {
|
||||
return Status::NotSupported(
|
||||
"Options move_files and allow_db_generated_files are not "
|
||||
"compatible.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1226,6 +1226,8 @@ class DBImpl : public DB {
|
|||
return logs_.back().number;
|
||||
}
|
||||
|
||||
// Test-only hook: the definition takes mutex_ and then calls
// DeleteObsoleteFiles().
void TEST_DeleteObsoleteFiles();
|
||||
|
||||
// Test-only accessor: returns a read-only reference to
// files_grabbed_for_purge_ (no copy is made).
const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
|
||||
return files_grabbed_for_purge_;
|
||||
}
|
||||
|
|
|
@ -314,6 +314,11 @@ const autovector<uint64_t>& DBImpl::TEST_GetFilesToQuarantine() const {
|
|||
return error_handler_.GetFilesToQuarantine();
|
||||
}
|
||||
|
||||
// Test-only entry point: runs DeleteObsoleteFiles() while holding mutex_.
void DBImpl::TEST_DeleteObsoleteFiles() {
  // The guard is held for the duration of the call and released on return.
  InstrumentedMutexLock mutex_guard(&mutex_);
  DeleteObsoleteFiles();
}
|
||||
|
||||
size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
|
||||
InstrumentedMutexLock l(&const_cast<DBImpl*>(this)->stats_history_mutex_);
|
||||
return EstimateInMemoryStatsHistorySize();
|
||||
|
|
|
@ -970,7 +970,9 @@ Status DBImpl::SetupDBId(const WriteOptions& write_options, bool read_only,
|
|||
}
|
||||
// Persist it to IDENTITY file if allowed
|
||||
if (!read_only) {
|
||||
s = SetIdentityFile(write_options, env_, dbname_, db_id_);
|
||||
s = SetIdentityFile(write_options, env_, dbname_,
|
||||
immutable_db_options_.metadata_write_temperature,
|
||||
db_id_);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
|
|
@ -295,7 +295,8 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) {
|
|||
Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
|
||||
VersionEdit new_db;
|
||||
const WriteOptions write_options(Env::IOActivity::kDBOpen);
|
||||
Status s = SetIdentityFile(write_options, env_, dbname_);
|
||||
Status s = SetIdentityFile(write_options, env_, dbname_,
|
||||
immutable_db_options_.metadata_write_temperature);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -319,6 +320,12 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
|
|||
}
|
||||
std::unique_ptr<FSWritableFile> file;
|
||||
FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
|
||||
// DB option takes precedence when not kUnknown
|
||||
if (immutable_db_options_.metadata_write_temperature !=
|
||||
Temperature::kUnknown) {
|
||||
file_options.temperature =
|
||||
immutable_db_options_.metadata_write_temperature;
|
||||
}
|
||||
s = NewWritableFile(fs_.get(), manifest, &file, file_options);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
|
@ -344,6 +351,7 @@ Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
|
|||
if (s.ok()) {
|
||||
// Make "CURRENT" file that points to the new manifest file.
|
||||
s = SetCurrentFile(write_options, fs_.get(), dbname_, 1,
|
||||
immutable_db_options_.metadata_write_temperature,
|
||||
directories_.GetDbDir());
|
||||
if (new_filenames) {
|
||||
new_filenames->emplace_back(
|
||||
|
@ -530,6 +538,12 @@ Status DBImpl::Recover(
|
|||
/*no_error_if_files_missing=*/false, is_retry,
|
||||
&desc_status);
|
||||
desc_status.PermitUncheckedError();
|
||||
if (is_retry) {
|
||||
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT);
|
||||
if (desc_status.ok()) {
|
||||
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
||||
}
|
||||
}
|
||||
if (can_retry) {
|
||||
// If we're opening for the first time and the failure is likely due to
|
||||
// a corrupt MANIFEST file (could result in either the log::Reader
|
||||
|
@ -1930,6 +1944,10 @@ IOStatus DBImpl::CreateWAL(const WriteOptions& write_options,
|
|||
BuildDBOptions(immutable_db_options_, mutable_db_options_);
|
||||
FileOptions opt_file_options =
|
||||
fs_->OptimizeForLogWrite(file_options_, db_options);
|
||||
// DB option takes precedence when not kUnknown
|
||||
if (immutable_db_options_.wal_write_temperature != Temperature::kUnknown) {
|
||||
opt_file_options.temperature = immutable_db_options_.wal_write_temperature;
|
||||
}
|
||||
std::string wal_dir = immutable_db_options_.GetWalDir();
|
||||
std::string log_fname = LogFileName(wal_dir, log_file_num);
|
||||
|
||||
|
|
|
@ -969,21 +969,17 @@ Status DBImpl::WriteImplWALOnly(
|
|||
assert(w.state == WriteThread::STATE_GROUP_LEADER);
|
||||
|
||||
if (publish_last_seq == kDoPublishLastSeq) {
|
||||
Status status;
|
||||
|
||||
// Currently we only use kDoPublishLastSeq in unordered_write
|
||||
assert(immutable_db_options_.unordered_write);
|
||||
WriteContext write_context;
|
||||
if (error_handler_.IsDBStopped()) {
|
||||
status = error_handler_.GetBGError();
|
||||
}
|
||||
|
||||
// TODO(myabandeh): Make preliminary checks thread-safe so we could do them
|
||||
// without paying the cost of obtaining the mutex.
|
||||
if (status.ok()) {
|
||||
LogContext log_context;
|
||||
status = PreprocessWrite(write_options, &log_context, &write_context);
|
||||
WriteStatusCheckOnLocked(status);
|
||||
}
|
||||
LogContext log_context;
|
||||
WriteContext write_context;
|
||||
Status status =
|
||||
PreprocessWrite(write_options, &log_context, &write_context);
|
||||
WriteStatusCheckOnLocked(status);
|
||||
|
||||
if (!status.ok()) {
|
||||
WriteThread::WriteGroup write_group;
|
||||
write_thread->EnterAsBatchGroupLeader(&w, &write_group);
|
||||
|
|
|
@ -705,6 +705,7 @@ class DBIOCorruptionTest
|
|||
DBIOCorruptionTest() : DBIOFailureTest() {
|
||||
BlockBasedTableOptions bbto;
|
||||
options_ = CurrentOptions();
|
||||
options_.statistics = CreateDBStatistics();
|
||||
|
||||
base_env_ = env_;
|
||||
EXPECT_NE(base_env_, nullptr);
|
||||
|
@ -727,6 +728,8 @@ class DBIOCorruptionTest
|
|||
|
||||
// Attempts to reopen the DB with this fixture's options_ and returns the
// resulting status instead of asserting on it.
Status ReopenDB() { return TryReopen(options_); }
|
||||
|
||||
// Non-owning pointer to the Statistics object held by options_.statistics.
Statistics* stats() { return options_.statistics.get(); }
|
||||
|
||||
protected:
|
||||
std::unique_ptr<Env> env_guard_;
|
||||
std::shared_ptr<CorruptionFS> fs_;
|
||||
|
@ -749,8 +752,12 @@ TEST_P(DBIOCorruptionTest, GetReadCorruptionRetry) {
|
|||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(s);
|
||||
ASSERT_EQ(val, "val1");
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
} else {
|
||||
ASSERT_TRUE(s.IsCorruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -773,8 +780,12 @@ TEST_P(DBIOCorruptionTest, IterReadCorruptionRetry) {
|
|||
}
|
||||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(iter->status());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
} else {
|
||||
ASSERT_TRUE(iter->status().IsCorruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
delete iter;
|
||||
}
|
||||
|
@ -799,9 +810,13 @@ TEST_P(DBIOCorruptionTest, MultiGetReadCorruptionRetry) {
|
|||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_EQ(values[0].ToString(), "val1");
|
||||
ASSERT_EQ(values[1].ToString(), "val2");
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
} else {
|
||||
ASSERT_TRUE(statuses[0].IsCorruption());
|
||||
ASSERT_TRUE(statuses[1].IsCorruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -818,6 +833,9 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
|
|||
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
|
||||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(s);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
|
||||
std::string val;
|
||||
ReadOptions ro;
|
||||
|
@ -826,6 +844,7 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
|
|||
ASSERT_EQ(val, "val1");
|
||||
} else {
|
||||
ASSERT_TRUE(s.IsCorruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -838,6 +857,9 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
|
|||
Status s = Flush();
|
||||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(s);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
|
||||
std::string val;
|
||||
ReadOptions ro;
|
||||
|
@ -846,6 +868,7 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
|
|||
ASSERT_EQ(val, "val1");
|
||||
} else {
|
||||
ASSERT_NOK(s);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -862,8 +885,12 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) {
|
|||
|
||||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(ReopenDB());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
} else {
|
||||
ASSERT_EQ(ReopenDB(), Status::Corruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
SyncPoint::GetInstance()->DisableProcessing();
|
||||
}
|
||||
|
|
|
@ -684,13 +684,14 @@ class DbMemtableKVChecksumTest : public DbKvChecksumTest {
|
|||
DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
|
||||
|
||||
protected:
|
||||
const size_t kValueLenOffset = 12;
|
||||
// Indices in the memtable entry that we will not corrupt.
|
||||
// For memtable entry format, see comments in MemTable::Add().
|
||||
// We do not corrupt key length and value length fields in this test
|
||||
// case since it causes segfault and ASAN will complain.
|
||||
// For this test case, key and value are all of length 3, so
|
||||
// key length field is at index 0 and value length field is at index 12.
|
||||
const std::set<size_t> index_not_to_corrupt{0, 12};
|
||||
const std::set<size_t> index_not_to_corrupt{0, kValueLenOffset};
|
||||
|
||||
void SkipNotToCorruptEntry() {
|
||||
if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
|
||||
|
@ -737,6 +738,8 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
|
|||
buf[corrupt_byte_offset_] += corrupt_byte_addend_;
|
||||
++corrupt_byte_offset_;
|
||||
});
|
||||
// Corrupt value only so that MultiGet below can find the key.
|
||||
corrupt_byte_offset_ = kValueLenOffset + 1;
|
||||
SyncPoint::GetInstance()->EnableProcessing();
|
||||
Options options = CurrentOptions();
|
||||
options.memtable_protection_bytes_per_key =
|
||||
|
@ -745,12 +748,17 @@ TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
|
|||
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
||||
}
|
||||
|
||||
std::string key = "key";
|
||||
SkipNotToCorruptEntry();
|
||||
while (MoreBytesToCorrupt()) {
|
||||
Reopen(options);
|
||||
ASSERT_OK(ExecuteWrite(nullptr));
|
||||
std::string val;
|
||||
ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
|
||||
ASSERT_TRUE(db_->Get(ReadOptions(), key, &val).IsCorruption());
|
||||
std::vector<std::string> vals = {val};
|
||||
std::vector<Status> statuses = db_->MultiGet(
|
||||
ReadOptions(), {db_->DefaultColumnFamily()}, {key}, &vals, nullptr);
|
||||
ASSERT_TRUE(statuses[0].IsCorruption());
|
||||
Destroy(options);
|
||||
SkipNotToCorruptEntry();
|
||||
}
|
||||
|
|
|
@ -339,6 +339,91 @@ TEST_F(DBMemTableTest, ColumnFamilyId) {
|
|||
}
|
||||
}
|
||||
|
||||
TEST_F(DBMemTableTest, IntegrityChecks) {
  // Scenario: key000000, key000001 and key000002 are inserted into the skip
  // list with every node forced to height 1 (the smallest height). The
  // in-memory bytes of the second key are then changed to aey000001, which
  // makes it sort before key000000. With `paranoid_memory_checks` enabled the
  // skip list is expected to notice the out-of-order neighbours and surface a
  // Corruption status; with it disabled, reads/scans may return wrong results.
  for (bool allow_data_in_error : {false, true}) {
    Options options = CurrentOptions();
    options.paranoid_memory_checks = true;
    options.allow_data_in_errors = allow_data_in_error;
    DestroyAndReopen(options);

    // Pin the height chosen for every new skip list node to 1.
    SyncPoint::GetInstance()->SetCallBack(
        "InlineSkipList::RandomHeight::height",
        [](void* height) { *static_cast<int*>(height) = 1; });
    SyncPoint::GetInstance()->EnableProcessing();
    ASSERT_OK(Put(Key(0), "val0"));
    ASSERT_OK(Put(Key(2), "val2"));

    // Capture the address of the encoded memtable entry for key000001.
    char* encoded_key1 = nullptr;
    SyncPoint::GetInstance()->SetCallBack(
        "MemTable::Add:BeforeReturn:Encoded", [&](void* encoded) {
          encoded_key1 =
              const_cast<char*>(static_cast<Slice*>(encoded)->data());
        });
    ASSERT_OK(Put(Key(1), "val1"));
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->ClearAllCallBacks();
    ASSERT_TRUE(encoded_key1);

    // Byte 0 holds the key size; the key bytes begin at byte 1.
    // "key000001" becomes "aey000001".
    encoded_key1[1] = 'a';

    ReadOptions read_opts;
    std::string value;
    Status get_status = db_->Get(read_opts, Key(1), &value);
    ASSERT_TRUE(get_status.IsCorruption());

    // The error text should contain Key(0) (as rendered by
    // Slice::ToString(true)) exactly when allow_data_in_errors is set.
    const std::string key0 = Slice(Key(0)).ToString(true);
    auto mentions_key0 = [&key0](const Status& st) {
      return st.ToString().find(key0) != std::string::npos;
    };
    ASSERT_EQ(mentions_key0(get_status), allow_data_in_error);
    // Without `paranoid_memory_checks`, NotFound will be returned.
    // This would fail an assertion in InlineSkipList::FindGreaterOrEqual().
    // If we remove the assertion, this passes.
    // ASSERT_TRUE(db_->Get(ReadOptions(), Key(1), &val).IsNotFound());

    // Same expectation through the MultiGet path.
    std::vector<std::string> multi_values;
    std::vector<Status> multi_statuses =
        db_->MultiGet(read_opts, {db_->DefaultColumnFamily()}, {Key(1)},
                      &multi_values, nullptr);
    ASSERT_TRUE(multi_statuses[0].IsCorruption());
    ASSERT_EQ(mentions_key0(multi_statuses[0]), allow_data_in_error);

    // Same expectation through the iterator entry points.
    std::unique_ptr<Iterator> it{db_->NewIterator(read_opts)};
    ASSERT_OK(it->status());
    it->Seek(Key(1));
    ASSERT_TRUE(it->status().IsCorruption());
    ASSERT_EQ(mentions_key0(it->status()), allow_data_in_error);

    it->Seek(Key(0));
    ASSERT_TRUE(it->Valid());
    ASSERT_OK(it->status());
    // Stepping forward through the height-1 list should run into the
    // out-of-order key.
    it->Next();
    ASSERT_TRUE(it->status().IsCorruption());
    ASSERT_EQ(mentions_key0(it->status()), allow_data_in_error);
    ASSERT_FALSE(it->Valid());

    it->SeekForPrev(Key(2));
    ASSERT_TRUE(it->status().IsCorruption());
    ASSERT_EQ(mentions_key0(it->status()), allow_data_in_error);

    // Internally DB Iter will iterate backwards (call Prev()) after
    // SeekToLast() to find the correct internal key with the last user key.
    // Prev() will do integrity checks and catch corruption.
    it->SeekToLast();
    ASSERT_TRUE(it->status().IsCorruption());
    ASSERT_EQ(mentions_key0(it->status()), allow_data_in_error);
    ASSERT_FALSE(it->Valid());
  }
}
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
|
|
@ -507,6 +507,23 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
|
|||
ASSERT_EQ(files_deleted, 0);
|
||||
ASSERT_EQ(files_scheduled_to_delete, 0);
|
||||
Close();
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
||||
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
|
||||
assert(arg);
|
||||
const std::string* const file_path =
|
||||
static_cast<const std::string*>(arg);
|
||||
if (EndsWith(*file_path, ".blob")) {
|
||||
++files_scheduled_to_delete;
|
||||
}
|
||||
});
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
||||
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
|
||||
const std::string* const file_path =
|
||||
static_cast<const std::string*>(arg);
|
||||
if (EndsWith(*file_path, ".blob")) {
|
||||
files_deleted++;
|
||||
}
|
||||
});
|
||||
ASSERT_OK(DestroyDB(dbname_, options));
|
||||
ASSERT_EQ(files_deleted, blob_files.size());
|
||||
ASSERT_EQ(files_scheduled_to_delete, blob_files.size());
|
||||
|
@ -649,6 +666,23 @@ TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
|
|||
}
|
||||
|
||||
Close();
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
||||
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
|
||||
assert(arg);
|
||||
const std::string* const file_path =
|
||||
static_cast<const std::string*>(arg);
|
||||
if (EndsWith(*file_path, ".blob")) {
|
||||
++files_scheduled_to_delete;
|
||||
}
|
||||
});
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
||||
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
|
||||
const std::string* const file_path =
|
||||
static_cast<const std::string*>(arg);
|
||||
if (EndsWith(*file_path, ".blob")) {
|
||||
files_deleted++;
|
||||
}
|
||||
});
|
||||
ASSERT_OK(DestroyDB(dbname_, options));
|
||||
sfm->WaitForEmptyTrash();
|
||||
ASSERT_EQ(files_deleted, 5);
|
||||
|
@ -883,8 +917,9 @@ TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
|
|||
// Create 4 files in L0
|
||||
for (char v = 'a'; v <= 'd'; v++) {
|
||||
if (v == 'c') {
|
||||
// Maximize the change that the last log file will be preserved in trash
|
||||
// before restarting the DB.
|
||||
// Maximize the chance that the last log file will be preserved in trash
|
||||
// before restarting the DB. (Enable slow deletion but at a very slow
|
||||
// deletion rate)
|
||||
// We have to set this on the 2nd to last file for it to delay deletion
|
||||
// on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash())
|
||||
options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
|
||||
|
@ -1902,6 +1937,24 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
|
|||
ASSERT_EQ(files_deleted, 1);
|
||||
|
||||
Close();
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
||||
"SstFileManagerImpl::ScheduleUnaccountedFileDeletion", [&](void* arg) {
|
||||
assert(arg);
|
||||
const std::string* const file_path =
|
||||
static_cast<const std::string*>(arg);
|
||||
if (EndsWith(*file_path, ".blob")) {
|
||||
++files_scheduled_to_delete;
|
||||
}
|
||||
});
|
||||
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
||||
"DeleteScheduler::OnDeleteFile", [&](void* arg) {
|
||||
const std::string* const file_path =
|
||||
static_cast<const std::string*>(arg);
|
||||
if (EndsWith(*file_path, ".blob")) {
|
||||
files_deleted++;
|
||||
}
|
||||
});
|
||||
ASSERT_OK(DestroyDB(dbname_, options));
|
||||
|
||||
ASSERT_EQ(files_scheduled_to_delete, 4);
|
||||
|
|
231
db/db_test2.cc
231
db/db_test2.cc
|
@ -10,6 +10,7 @@
|
|||
#include <atomic>
|
||||
#include <cstdlib>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include "db/db_test_util.h"
|
||||
|
@ -26,6 +27,7 @@
|
|||
#include "rocksdb/utilities/replayer.h"
|
||||
#include "rocksdb/wal_filter.h"
|
||||
#include "test_util/testutil.h"
|
||||
#include "util/defer.h"
|
||||
#include "util/random.h"
|
||||
#include "utilities/fault_injection_env.h"
|
||||
|
||||
|
@ -6544,6 +6546,235 @@ TEST_P(RenameCurrentTest, Compaction) {
|
|||
ASSERT_EQ("d_value", Get("d"));
|
||||
}
|
||||
|
||||
TEST_F(DBTest2, VariousFileTemperatures) {
|
||||
constexpr size_t kNumberFileTypes = static_cast<size_t>(kBlobFile) + 1U;
|
||||
|
||||
struct MyTestFS : public FileTemperatureTestFS {
|
||||
explicit MyTestFS(const std::shared_ptr<FileSystem>& fs)
|
||||
: FileTemperatureTestFS(fs) {
|
||||
Reset();
|
||||
}
|
||||
|
||||
IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
|
||||
std::unique_ptr<FSWritableFile>* result,
|
||||
IODebugContext* dbg) override {
|
||||
IOStatus ios =
|
||||
FileTemperatureTestFS::NewWritableFile(fname, opts, result, dbg);
|
||||
if (ios.ok()) {
|
||||
uint64_t number;
|
||||
FileType type;
|
||||
if (ParseFileName(GetFileName(fname), &number, "LOG", &type)) {
|
||||
if (type == kTableFile) {
|
||||
// Not checked here
|
||||
} else if (type == kWalFile) {
|
||||
if (opts.temperature != expected_wal_temperature) {
|
||||
std::cerr << "Attempt to open " << fname << " with temperature "
|
||||
<< temperature_to_string[opts.temperature]
|
||||
<< " rather than "
|
||||
<< temperature_to_string[expected_wal_temperature]
|
||||
<< std::endl;
|
||||
assert(false);
|
||||
}
|
||||
} else if (type == kDescriptorFile) {
|
||||
if (opts.temperature != expected_manifest_temperature) {
|
||||
std::cerr << "Attempt to open " << fname << " with temperature "
|
||||
<< temperature_to_string[opts.temperature]
|
||||
<< " rather than "
|
||||
<< temperature_to_string[expected_wal_temperature]
|
||||
<< std::endl;
|
||||
assert(false);
|
||||
}
|
||||
} else if (opts.temperature != expected_other_metadata_temperature) {
|
||||
std::cerr << "Attempt to open " << fname << " with temperature "
|
||||
<< temperature_to_string[opts.temperature]
|
||||
<< " rather than "
|
||||
<< temperature_to_string[expected_wal_temperature]
|
||||
<< std::endl;
|
||||
assert(false);
|
||||
}
|
||||
UpdateCount(type, 1);
|
||||
}
|
||||
}
|
||||
return ios;
|
||||
}
|
||||
|
||||
IOStatus RenameFile(const std::string& src, const std::string& dst,
|
||||
const IOOptions& options,
|
||||
IODebugContext* dbg) override {
|
||||
IOStatus ios = FileTemperatureTestFS::RenameFile(src, dst, options, dbg);
|
||||
if (ios.ok()) {
|
||||
uint64_t number;
|
||||
FileType src_type;
|
||||
FileType dst_type;
|
||||
assert(ParseFileName(GetFileName(src), &number, "LOG", &src_type));
|
||||
assert(ParseFileName(GetFileName(dst), &number, "LOG", &dst_type));
|
||||
|
||||
UpdateCount(src_type, -1);
|
||||
UpdateCount(dst_type, 1);
|
||||
}
|
||||
return ios;
|
||||
}
|
||||
|
||||
void UpdateCount(FileType type, int delta) {
|
||||
size_t i = static_cast<size_t>(type);
|
||||
assert(i < kNumberFileTypes);
|
||||
counts[i].FetchAddRelaxed(delta);
|
||||
}
|
||||
|
||||
std::map<FileType, size_t> PopCounts() {
|
||||
std::map<FileType, size_t> ret;
|
||||
for (size_t i = 0; i < kNumberFileTypes; ++i) {
|
||||
int c = counts[i].ExchangeRelaxed(0);
|
||||
if (c > 0) {
|
||||
ret[static_cast<FileType>(i)] = c;
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
FileOptions OptimizeForLogWrite(
|
||||
const FileOptions& file_options,
|
||||
const DBOptions& /*db_options*/) const override {
|
||||
FileOptions opts = file_options;
|
||||
if (optimize_wal_temperature != Temperature::kUnknown) {
|
||||
opts.temperature = optimize_wal_temperature;
|
||||
}
|
||||
return opts;
|
||||
}
|
||||
|
||||
FileOptions OptimizeForManifestWrite(
|
||||
const FileOptions& file_options) const override {
|
||||
FileOptions opts = file_options;
|
||||
if (optimize_manifest_temperature != Temperature::kUnknown) {
|
||||
opts.temperature = optimize_manifest_temperature;
|
||||
}
|
||||
return opts;
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
optimize_manifest_temperature = Temperature::kUnknown;
|
||||
optimize_wal_temperature = Temperature::kUnknown;
|
||||
expected_manifest_temperature = Temperature::kUnknown;
|
||||
expected_other_metadata_temperature = Temperature::kUnknown;
|
||||
expected_wal_temperature = Temperature::kUnknown;
|
||||
for (auto& c : counts) {
|
||||
c.StoreRelaxed(0);
|
||||
}
|
||||
}
|
||||
|
||||
Temperature optimize_manifest_temperature;
|
||||
Temperature optimize_wal_temperature;
|
||||
Temperature expected_manifest_temperature;
|
||||
Temperature expected_other_metadata_temperature;
|
||||
Temperature expected_wal_temperature;
|
||||
std::array<RelaxedAtomic<int>, kNumberFileTypes> counts;
|
||||
};
|
||||
|
||||
// We don't have enough non-unknown temps to confidently distinguish that
|
||||
// a specific setting caused a specific outcome, in a single run. This is a
|
||||
// reasonable work-around without blowing up test time. Only returns
|
||||
// non-unknown temperatures.
|
||||
auto RandomTemp = [] {
|
||||
static std::vector<Temperature> temps = {
|
||||
Temperature::kHot, Temperature::kWarm, Temperature::kCold};
|
||||
return temps[Random::GetTLSInstance()->Uniform(
|
||||
static_cast<int>(temps.size()))];
|
||||
};
|
||||
|
||||
auto test_fs = std::make_shared<MyTestFS>(env_->GetFileSystem());
|
||||
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
|
||||
for (bool use_optimize : {false, true}) {
|
||||
std::cerr << "use_optimize: " << std::to_string(use_optimize) << std::endl;
|
||||
for (bool use_temp_options : {false, true}) {
|
||||
std::cerr << "use_temp_options: " << std::to_string(use_temp_options)
|
||||
<< std::endl;
|
||||
|
||||
Options options = CurrentOptions();
|
||||
// Currently required for last level temperature
|
||||
options.compaction_style = kCompactionStyleUniversal;
|
||||
options.env = env.get();
|
||||
test_fs->Reset();
|
||||
if (use_optimize) {
|
||||
test_fs->optimize_manifest_temperature = RandomTemp();
|
||||
test_fs->expected_manifest_temperature =
|
||||
test_fs->optimize_manifest_temperature;
|
||||
test_fs->optimize_wal_temperature = RandomTemp();
|
||||
test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature;
|
||||
}
|
||||
if (use_temp_options) {
|
||||
options.metadata_write_temperature = RandomTemp();
|
||||
test_fs->expected_manifest_temperature =
|
||||
options.metadata_write_temperature;
|
||||
test_fs->expected_other_metadata_temperature =
|
||||
options.metadata_write_temperature;
|
||||
options.wal_write_temperature = RandomTemp();
|
||||
test_fs->expected_wal_temperature = options.wal_write_temperature;
|
||||
options.last_level_temperature = RandomTemp();
|
||||
options.default_write_temperature = RandomTemp();
|
||||
}
|
||||
|
||||
DestroyAndReopen(options);
|
||||
Defer closer([&] { Close(); });
|
||||
|
||||
using FTC = std::map<FileType, size_t>;
|
||||
// Files on DB startup
|
||||
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
|
||||
{kDescriptorFile, 2},
|
||||
{kCurrentFile, 2},
|
||||
{kIdentityFile, 1},
|
||||
{kOptionsFile, 1}}));
|
||||
|
||||
// Temperature count map
|
||||
using TCM = std::map<Temperature, size_t>;
|
||||
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({}));
|
||||
|
||||
ASSERT_OK(Put("foo", "1"));
|
||||
ASSERT_OK(Put("bar", "1"));
|
||||
ASSERT_OK(Flush());
|
||||
ASSERT_OK(Put("foo", "2"));
|
||||
ASSERT_OK(Put("bar", "2"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
|
||||
TCM({{options.default_write_temperature, 2}}));
|
||||
|
||||
ASSERT_OK(db_->CompactRange({}, nullptr, nullptr));
|
||||
ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
|
||||
|
||||
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(),
|
||||
TCM({{options.last_level_temperature, 1}}));
|
||||
|
||||
ASSERT_OK(Put("foo", "3"));
|
||||
ASSERT_OK(Put("bar", "3"));
|
||||
ASSERT_OK(Flush());
|
||||
|
||||
// Just in memtable/WAL
|
||||
ASSERT_OK(Put("dog", "3"));
|
||||
|
||||
{
|
||||
TCM expected;
|
||||
expected[options.default_write_temperature] += 1;
|
||||
expected[options.last_level_temperature] += 1;
|
||||
ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), expected);
|
||||
}
|
||||
|
||||
// New files during operation
|
||||
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 3}, {kTableFile, 4}}));
|
||||
|
||||
Reopen(options);
|
||||
|
||||
// New files during re-open/recovery
|
||||
ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1},
|
||||
{kTableFile, 1},
|
||||
{kDescriptorFile, 1},
|
||||
{kCurrentFile, 1},
|
||||
{kOptionsFile, 1}}));
|
||||
|
||||
Destroy(options);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(DBTest2, LastLevelTemperature) {
|
||||
class TestListener : public EventListener {
|
||||
public:
|
||||
|
|
|
@ -366,6 +366,11 @@ Options DBTestBase::GetOptions(
|
|||
table_options.block_cache = NewLRUCache(/* too small */ 1);
|
||||
}
|
||||
|
||||
// Test anticipated new default as much as reasonably possible (and remove
|
||||
// this code when obsolete)
|
||||
assert(!table_options.decouple_partitioned_filters);
|
||||
table_options.decouple_partitioned_filters = true;
|
||||
|
||||
bool can_allow_mmap = IsMemoryMappedAccessSupported();
|
||||
switch (option_config) {
|
||||
case kHashSkipList:
|
||||
|
|
|
@ -831,6 +831,15 @@ class FileTemperatureTestFS : public FileSystemWrapper {
|
|||
return count;
|
||||
}
|
||||
|
||||
std::map<Temperature, size_t> CountCurrentSstFilesByTemp() {
|
||||
MutexLock lock(&mu_);
|
||||
std::map<Temperature, size_t> ret;
|
||||
for (const auto& e : current_sst_file_temperatures_) {
|
||||
ret[e.second]++;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
|
||||
MutexLock lock(&mu_);
|
||||
current_sst_file_temperatures_[number] = temp;
|
||||
|
@ -842,7 +851,7 @@ class FileTemperatureTestFS : public FileSystemWrapper {
|
|||
requested_sst_file_temperatures_;
|
||||
std::map<uint64_t, Temperature> current_sst_file_temperatures_;
|
||||
|
||||
std::string GetFileName(const std::string& fname) {
|
||||
static std::string GetFileName(const std::string& fname) {
|
||||
auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
|
||||
// workaround only for Windows that the file path could contain both Windows
|
||||
// FilePathSeparator and '/'
|
||||
|
|
|
@ -213,7 +213,6 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
|
|||
options.num_levels = num_levels_;
|
||||
options.write_buffer_size = 105 << 10; // 105KB
|
||||
options.arena_block_size = 4 << 10;
|
||||
options.target_file_size_base = 32 << 10; // 32KB
|
||||
// trigger compaction if there are >= 4 files
|
||||
options.level0_file_num_compaction_trigger = 4;
|
||||
KeepFilterFactory* filter = new KeepFilterFactory(true);
|
||||
|
|
|
@ -1472,6 +1472,126 @@ TEST_F(DBWALTest, SyncMultipleLogs) {
|
|||
ASSERT_OK(dbfull()->SyncWAL());
|
||||
}
|
||||
|
||||
// Disabled reproduction of a point-in-time recovery problem with recycled
// WALs: after an approximated crash, recycled log files still hold bytes from
// an older incarnation past the end of the current WAL data, and recovery
// drops "key3" even though a later-ingested SST file ("external2") survives.
TEST_F(DBWALTest, DISABLED_RecycleMultipleWalsCrash) {
  Options options = CurrentOptions();
  options.max_write_buffer_number = 5;
  options.track_and_verify_wals_in_manifest = true;
  options.max_bgerror_resume_count = 0;  // manual resume
  options.recycle_log_file_num = 3;
  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;

  // Disable truncating recycled WALs to new size in posix env
  // (approximating a crash)
  SyncPoint::GetInstance()->SetCallBack(
      "PosixWritableFile::Close",
      [](void* arg) { *(static_cast<size_t*>(arg)) = 0; });
  SyncPoint::GetInstance()->EnableProcessing();

  // Re-open with desired options
  DestroyAndReopen(options);
  Defer closer([this]() { Close(); });
  // Declared after `closer` so that it is destroyed first: the sync point is
  // torn down before the final Close(), and it is torn down even when an
  // ASSERT_* below returns from the test early.
  Defer sync_point_cleanup([]() {
    SyncPoint::GetInstance()->DisableProcessing();
    SyncPoint::GetInstance()->ClearAllCallBacks();
  });

  // Ensure WAL recycling wasn't sanitized away
  ASSERT_EQ(db_->GetOptions().recycle_log_file_num,
            options.recycle_log_file_num);

  // Prepare external files for later ingestion
  std::string sst_files_dir = dbname_ + "/sst_files/";
  ASSERT_OK(DestroyDir(env_, sst_files_dir));
  ASSERT_OK(env_->CreateDir(sst_files_dir));
  std::string external_file1 = sst_files_dir + "file1.sst";
  {
    SstFileWriter sst_file_writer(EnvOptions(), options);
    ASSERT_OK(sst_file_writer.Open(external_file1));
    ASSERT_OK(sst_file_writer.Put("external1", "ex1"));
    ExternalSstFileInfo file_info;
    ASSERT_OK(sst_file_writer.Finish(&file_info));
  }
  std::string external_file2 = sst_files_dir + "file2.sst";
  {
    SstFileWriter sst_file_writer(EnvOptions(), options);
    ASSERT_OK(sst_file_writer.Open(external_file2));
    ASSERT_OK(sst_file_writer.Put("external2", "ex2"));
    ExternalSstFileInfo file_info;
    ASSERT_OK(sst_file_writer.Finish(&file_info));
  }

  // Populate some WALs to be recycled such that there will be extra data
  // from an old incarnation of the WAL on recovery
  ASSERT_OK(db_->PauseBackgroundWork());
  ASSERT_OK(Put("ignore1", Random::GetTLSInstance()->RandomString(500)));
  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
  ASSERT_OK(Put("ignore2", Random::GetTLSInstance()->RandomString(500)));
  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
  ASSERT_OK(db_->ContinueBackgroundWork());
  ASSERT_OK(Flush());
  ASSERT_OK(Put("ignore3", Random::GetTLSInstance()->RandomString(500)));
  ASSERT_OK(Flush());

  // Verify expected log files (still there for recycling)
  std::vector<FileAttributes> files;
  int log_count = 0;
  ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
  for (const auto& f : files) {
    if (EndsWith(f.name, ".log")) {
      EXPECT_GT(f.size_bytes, 500);
      ++log_count;
    }
  }
  EXPECT_EQ(log_count, 3);

  // (Re-used recipe) Generate two inactive WALs and one active WAL, with a
  // gap in sequence numbers to interfere with recovery
  ASSERT_OK(db_->PauseBackgroundWork());
  ASSERT_OK(Put("key1", "val1"));
  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
  ASSERT_OK(Put("key2", "val2"));
  ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
  // Need a gap in sequence numbers, so e.g. ingest external file
  // with an open snapshot
  {
    ManagedSnapshot snapshot(db_);
    ASSERT_OK(
        db_->IngestExternalFile({external_file1}, IngestExternalFileOptions()));
  }
  ASSERT_OK(Put("key3", "val3"));
  ASSERT_OK(db_->SyncWAL());
  // Need an SST file that is logically after that WAL, so that dropping WAL
  // data is not a valid point in time.
  {
    ManagedSnapshot snapshot(db_);
    ASSERT_OK(
        db_->IngestExternalFile({external_file2}, IngestExternalFileOptions()));
  }

  // Approximate a crash, with respect to recycled WAL data extending past
  // the end of the current WAL data (see SyncPoint callback above)
  Close();

  // Verify recycled log files haven't been truncated
  files.clear();
  log_count = 0;
  ASSERT_OK(options.env->GetChildrenFileAttributes(dbname_, &files));
  for (const auto& f : files) {
    if (EndsWith(f.name, ".log")) {
      EXPECT_GT(f.size_bytes, 500);
      ++log_count;
    }
  }
  EXPECT_EQ(log_count, 3);

  // Verify no data loss after reopen.
  Reopen(options);
  EXPECT_EQ("val1", Get("key1"));
  EXPECT_EQ("val2", Get("key2"));  // Passes because of adjacent seqnos
  EXPECT_EQ("ex1", Get("external1"));
  EXPECT_EQ("val3", Get("key3"));  // <- ONLY FAILURE! (Not a point in time)
  EXPECT_EQ("ex2", Get("external2"));
}
|
||||
|
||||
TEST_F(DBWALTest, SyncWalPartialFailure) {
|
||||
class MyTestFileSystem : public FileSystemWrapper {
|
||||
public:
|
||||
|
@ -1532,7 +1652,7 @@ TEST_F(DBWALTest, SyncWalPartialFailure) {
|
|||
// * one inactive WAL, not synced, and
|
||||
// * one active WAL, not synced
|
||||
// with a single thread, to exercise as much logic as we reasonably can.
|
||||
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->PauseBackgroundWork());
|
||||
ASSERT_OK(db_->PauseBackgroundWork());
|
||||
ASSERT_OK(Put("key1", "val1"));
|
||||
ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
|
||||
ASSERT_OK(db_->SyncWAL());
|
||||
|
|
|
@ -172,6 +172,70 @@ TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
|
|||
Close();
|
||||
}
|
||||
|
||||
// Writes the same timestamped key into the default column family and into a
// second column family with one WriteBatch, then reads both back through the
// array-based MultiGet overload and checks value, status and timestamp for
// every entry.
TEST_F(DBBasicTestWithTimestamp, MultiGetMultipleCfs) {
  const size_t kTimestampSize = Timestamp(0, 0).size();
  TestComparator test_cmp(kTimestampSize);

  // Default column family, using the timestamp-aware comparator.
  Options options = CurrentOptions();
  options.env = env_;
  options.create_if_missing = true;
  options.avoid_flush_during_shutdown = true;
  options.comparator = &test_cmp;
  DestroyAndReopen(options);

  // Second column family "data" with the same comparator.
  Options cf_options = CurrentOptions();
  cf_options.env = env_;
  cf_options.comparator = &test_cmp;
  ColumnFamilyHandle* handle = nullptr;
  ASSERT_OK(db_->CreateColumnFamily(cf_options, "data", &handle));

  // One batch touching both column families; timestamps are filled in
  // afterwards for every key.
  const std::string ts = Timestamp(1, 0);
  WriteBatch wb(0, 0, 0, kTimestampSize);
  ASSERT_OK(wb.Put("a", "value"));
  ASSERT_OK(wb.Put(handle, "a", "value"));
  ASSERT_OK(wb.UpdateTimestamps(
      ts, [kTimestampSize](uint32_t /*cf_id*/) { return kTimestampSize; }));
  ASSERT_OK(db_->Write(WriteOptions(), &wb));

  // Key i is looked up in column family i.
  const int num_keys = 2;
  std::vector<Slice> keys;
  std::vector<std::string> expected_values;
  for (int n = 0; n < num_keys; ++n) {
    keys.emplace_back("a");
    expected_values.emplace_back("value");
  }
  std::vector<ColumnFamilyHandle*> handles{db_->DefaultColumnFamily(), handle};

  {
    Slice read_ts_slice(ts);
    ReadOptions read_opts;
    read_opts.timestamp = &read_ts_slice;

    std::vector<PinnableSlice> values(num_keys);
    std::vector<Status> statuses(num_keys);
    std::vector<std::string> timestamps(num_keys);

    db_->MultiGet(read_opts, num_keys, handles.data(), keys.data(),
                  values.data(), timestamps.data(), statuses.data());

    for (int idx = 0; idx < num_keys; ++idx) {
      ASSERT_OK(statuses[idx]);
      ASSERT_EQ(expected_values[idx], values[idx].ToString());
      ASSERT_EQ(ts, timestamps[idx]);
    }
  }

  delete handle;
  Close();
}
|
||||
|
||||
TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
|
||||
Options options = CurrentOptions();
|
||||
options.env = env_;
|
||||
|
|
|
@ -330,17 +330,16 @@ inline Slice ExtractUserKey(const Slice& internal_key) {
|
|||
// output : <user_provided_key>
|
||||
inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
                                             size_t ts_sz) {
  // Returns a view of `internal_key` with its trailing
  // (kNumInternalBytes + ts_sz) bytes dropped. No bytes are copied; the
  // result points into the caller's buffer.
  //
  // The size check must come before the subtraction: with a too-short key the
  // unsigned length would wrap around.
  assert(internal_key.size() >= kNumInternalBytes + ts_sz);
  return Slice(internal_key.data(),
               internal_key.size() - (kNumInternalBytes + ts_sz));
}
|
||||
|
||||
// input [user key]: <user_provided_key | ts>
|
||||
// output: <user_provided_key>
|
||||
inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
  // Returns a view of `user_key` without its trailing `ts_sz` bytes. No bytes
  // are copied; the result points into the caller's buffer.
  //
  // Check the length first so the unsigned subtraction below cannot wrap.
  assert(user_key.size() >= ts_sz);
  return Slice(user_key.data(), user_key.size() - ts_sz);
}
|
||||
|
||||
// input [user key]: <user_provided_key | ts>
|
||||
|
|
|
@ -124,6 +124,7 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished(
|
|||
<< "comparator" << table_properties.comparator_name
|
||||
<< "user_defined_timestamps_persisted"
|
||||
<< table_properties.user_defined_timestamps_persisted
|
||||
<< "key_largest_seqno" << table_properties.key_largest_seqno
|
||||
<< "merge_operator" << table_properties.merge_operator_name
|
||||
<< "prefix_extractor_name"
|
||||
<< table_properties.prefix_extractor_name << "property_collectors"
|
||||
|
|
|
@ -114,7 +114,6 @@ Status ExternalSstFileIngestionJob::Prepare(
|
|||
const std::string path_inside_db = TableFileName(
|
||||
cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
|
||||
if (ingestion_options_.move_files) {
|
||||
assert(!ingestion_options_.allow_db_generated_files);
|
||||
status =
|
||||
fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
|
||||
if (status.ok()) {
|
||||
|
@ -627,7 +626,8 @@ void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
|
|||
DeleteInternalFiles();
|
||||
consumed_seqno_count_ = 0;
|
||||
files_overlap_ = false;
|
||||
} else if (status.ok() && ingestion_options_.move_files) {
|
||||
} else if (status.ok() && ingestion_options_.move_files &&
|
||||
!ingestion_options_.allow_db_generated_files) {
|
||||
// The files were moved and added successfully, remove original file links
|
||||
for (IngestedFileInfo& f : files_to_ingest_) {
|
||||
Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
|
||||
|
@ -914,9 +914,18 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
|
|||
} else if (!iter->status().ok()) {
|
||||
return iter->status();
|
||||
}
|
||||
if (ingestion_options_.allow_db_generated_files) {
|
||||
// Verify that all keys have seqno zero.
|
||||
// TODO: store largest seqno in table property and validate it instead.
|
||||
SequenceNumber largest_seqno =
|
||||
table_reader.get()->GetTableProperties()->key_largest_seqno;
|
||||
// UINT64_MAX means unknown and the file is generated before table property
|
||||
// `key_largest_seqno` is introduced.
|
||||
if (largest_seqno != UINT64_MAX && largest_seqno > 0) {
|
||||
return Status::Corruption(
|
||||
"External file has non zero largest sequence number " +
|
||||
std::to_string(largest_seqno));
|
||||
}
|
||||
if (ingestion_options_.allow_db_generated_files &&
|
||||
largest_seqno == UINT64_MAX) {
|
||||
// Need to verify that all keys have seqno zero.
|
||||
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
||||
Status pik_status =
|
||||
ParseInternalKey(iter->key(), &key, allow_data_in_errors);
|
||||
|
|
|
@ -674,10 +674,8 @@ class SstFileWriterCollector : public TablePropertiesCollector {
|
|||
|
||||
Status Finish(UserCollectedProperties* properties) override {
  // Publishes this collector's two entries: a "<prefix>_SstFileWriterCollector"
  // marker and "<prefix>_Count" holding count_ as a decimal string.
  //
  // The entries are inserted rather than assigned as a whole container, so
  // anything already stored in *properties is kept.
  // NOTE(review): presumably UserCollectedProperties is a string-to-string
  // map, where insert() leaves an existing key untouched -- confirm.
  properties->insert({prefix_ + "_SstFileWriterCollector", "YES"});
  properties->insert({prefix_ + "_Count", std::to_string(count_)});
  return Status::OK();
}
|
||||
|
||||
|
@ -3727,13 +3725,14 @@ INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
|
|||
std::make_tuple(true, true),
|
||||
std::make_tuple(false, false)));
|
||||
|
||||
class IngestDBGeneratedFileTest : public ExternalSSTFileTestBase,
|
||||
public ::testing::WithParamInterface<bool> {
|
||||
class IngestDBGeneratedFileTest
|
||||
: public ExternalSSTFileTestBase,
|
||||
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
|
||||
public:
|
||||
IngestDBGeneratedFileTest() {
|
||||
ingest_opts.allow_db_generated_files = true;
|
||||
ingest_opts.move_files = false;
|
||||
ingest_opts.verify_checksums_before_ingest = GetParam();
|
||||
ingest_opts.move_files = std::get<0>(GetParam());
|
||||
ingest_opts.verify_checksums_before_ingest = std::get<1>(GetParam());
|
||||
ingest_opts.snapshot_consistency = false;
|
||||
}
|
||||
|
||||
|
@ -3742,9 +3741,16 @@ class IngestDBGeneratedFileTest : public ExternalSSTFileTestBase,
|
|||
};
|
||||
|
||||
// The fixture reads two booleans from its parameter (move_files and
// verify_checksums_before_ingest), so run every combination of the two.
INSTANTIATE_TEST_CASE_P(BasicMultiConfig, IngestDBGeneratedFileTest,
                        testing::Combine(testing::Bool(), testing::Bool()));
|
||||
|
||||
TEST_P(IngestDBGeneratedFileTest, FailureCase) {
|
||||
if (encrypted_env_ && ingest_opts.move_files) {
|
||||
// FIXME: should fail ingestion or support this combination.
|
||||
ROCKSDB_GTEST_SKIP(
|
||||
"Encrypted env and move_files do not work together, as we reopen the "
|
||||
"file after linking it which appends an extra encryption prefix.");
|
||||
return;
|
||||
}
|
||||
// Ingesting overlapping data should always fail.
|
||||
do {
|
||||
SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
|
||||
|
@ -3778,6 +3784,7 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
|
|||
live_meta[0].relative_filename);
|
||||
// Ingesting a file whose boundary key has non-zero seqno.
|
||||
Status s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
|
||||
// This error msg is from checking seqno of boundary keys.
|
||||
ASSERT_TRUE(
|
||||
s.ToString().find("External file has non zero sequence number") !=
|
||||
std::string::npos);
|
||||
|
@ -3824,10 +3831,9 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
|
|||
live_meta[0].directory + "/" + live_meta[0].relative_filename;
|
||||
s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
|
||||
ASSERT_NOK(s);
|
||||
ASSERT_TRUE(
|
||||
s.ToString().find(
|
||||
"External file has a key with non zero sequence number") !=
|
||||
std::string::npos);
|
||||
// This error msg is from checking largest seqno in table property.
|
||||
ASSERT_TRUE(s.ToString().find("non zero largest sequence number") !=
|
||||
std::string::npos);
|
||||
db_->ReleaseSnapshot(snapshot);
|
||||
}
|
||||
|
||||
|
@ -3897,14 +3903,6 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
|
|||
ASSERT_TRUE(s.ToString().find(err) != std::string::npos);
|
||||
ASSERT_NOK(s);
|
||||
|
||||
ingest_opts.move_files = true;
|
||||
s = db_->IngestExternalFile(to_ingest_files, ingest_opts);
|
||||
ingest_opts.move_files = false;
|
||||
ASSERT_TRUE(
|
||||
s.ToString().find("Options move_files and allow_db_generated_files are "
|
||||
"not compatible") != std::string::npos);
|
||||
ASSERT_NOK(s);
|
||||
|
||||
ingest_opts.snapshot_consistency = false;
|
||||
ASSERT_OK(db_->IngestExternalFile(to_ingest_files, ingest_opts));
|
||||
db_->ReleaseSnapshot(snapshot);
|
||||
|
@ -3924,14 +3922,16 @@ TEST_P(IngestDBGeneratedFileTest, FailureCase) {
|
|||
|
||||
class IngestDBGeneratedFileTest2
|
||||
: public ExternalSSTFileTestBase,
|
||||
public ::testing::WithParamInterface<std::tuple<bool, bool, bool, bool>> {
|
||||
public ::testing::WithParamInterface<
|
||||
std::tuple<bool, bool, bool, bool, bool>> {
|
||||
public:
|
||||
IngestDBGeneratedFileTest2() = default;
|
||||
};
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(VaryingOptions, IngestDBGeneratedFileTest2,
|
||||
testing::Combine(testing::Bool(), testing::Bool(),
|
||||
testing::Bool(), testing::Bool()));
|
||||
testing::Bool(), testing::Bool(),
|
||||
testing::Bool()));
|
||||
|
||||
TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
|
||||
// Use a separate column family to sort some data, generate multiple SST
|
||||
|
@ -3939,11 +3939,11 @@ TEST_P(IngestDBGeneratedFileTest2, NotOverlapWithDB) {
|
|||
// to be ingested does not overlap with existing data.
|
||||
IngestExternalFileOptions ingest_opts;
|
||||
ingest_opts.allow_db_generated_files = true;
|
||||
ingest_opts.move_files = false;
|
||||
ingest_opts.snapshot_consistency = std::get<0>(GetParam());
|
||||
ingest_opts.allow_global_seqno = std::get<1>(GetParam());
|
||||
ingest_opts.allow_blocking_flush = std::get<2>(GetParam());
|
||||
ingest_opts.fail_if_not_bottommost_level = std::get<3>(GetParam());
|
||||
ingest_opts.move_files = std::get<4>(GetParam());
|
||||
|
||||
do {
|
||||
SCOPED_TRACE("option_config_ = " + std::to_string(option_config_));
|
||||
|
|
|
@ -1156,6 +1156,11 @@ void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() {
|
|||
// Find the newest user-defined timestamps from all the flushed memtables.
|
||||
for (MemTable* m : mems_) {
|
||||
Slice table_newest_udt = m->GetNewestUDT();
|
||||
// Empty memtables can be legitimately created and flushed, for example
|
||||
// by error recovery flush attempts.
|
||||
if (table_newest_udt.empty()) {
|
||||
continue;
|
||||
}
|
||||
if (cutoff_udt_.empty() ||
|
||||
ucmp->CompareTimestamp(table_newest_udt, cutoff_udt_) > 0) {
|
||||
if (!cutoff_udt_.empty()) {
|
||||
|
|
|
@ -68,7 +68,8 @@ class FlushJobTestBase : public testing::Test {
|
|||
}
|
||||
|
||||
void NewDB() {
|
||||
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
|
||||
ASSERT_OK(
|
||||
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
|
||||
VersionEdit new_db;
|
||||
|
||||
new_db.SetLogNumber(0);
|
||||
|
@ -114,7 +115,8 @@ class FlushJobTestBase : public testing::Test {
|
|||
}
|
||||
ASSERT_OK(s);
|
||||
// Make "CURRENT" file that points to the new manifest file.
|
||||
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
|
||||
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
Temperature::kUnknown, nullptr);
|
||||
ASSERT_OK(s);
|
||||
}
|
||||
|
||||
|
|
|
@ -354,13 +354,13 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
|
|||
}
|
||||
|
||||
TEST_F(EventListenerTest, MultiCF) {
|
||||
Options options;
|
||||
options.env = CurrentOptions().env;
|
||||
options.write_buffer_size = k110KB;
|
||||
#ifdef ROCKSDB_USING_THREAD_STATUS
|
||||
options.enable_thread_tracking = true;
|
||||
#endif // ROCKSDB_USING_THREAD_STATUS
|
||||
for (auto atomic_flush : {false, true}) {
|
||||
Options options;
|
||||
options.env = CurrentOptions().env;
|
||||
options.write_buffer_size = k110KB;
|
||||
#ifdef ROCKSDB_USING_THREAD_STATUS
|
||||
options.enable_thread_tracking = true;
|
||||
#endif // ROCKSDB_USING_THREAD_STATUS
|
||||
options.atomic_flush = atomic_flush;
|
||||
options.create_if_missing = true;
|
||||
DestroyAndReopen(options);
|
||||
|
|
106
db/memtable.cc
106
db/memtable.cc
|
@ -67,9 +67,10 @@ ImmutableMemTableOptions::ImmutableMemTableOptions(
|
|||
statistics(ioptions.stats),
|
||||
merge_operator(ioptions.merge_operator.get()),
|
||||
info_log(ioptions.logger),
|
||||
allow_data_in_errors(ioptions.allow_data_in_errors),
|
||||
protection_bytes_per_key(
|
||||
mutable_cf_options.memtable_protection_bytes_per_key) {}
|
||||
mutable_cf_options.memtable_protection_bytes_per_key),
|
||||
allow_data_in_errors(ioptions.allow_data_in_errors),
|
||||
paranoid_memory_checks(mutable_cf_options.paranoid_memory_checks) {}
|
||||
|
||||
MemTable::MemTable(const InternalKeyComparator& cmp,
|
||||
const ImmutableOptions& ioptions,
|
||||
|
@ -370,15 +371,17 @@ class MemTableIterator : public InternalIterator {
|
|||
: bloom_(nullptr),
|
||||
prefix_extractor_(mem.prefix_extractor_),
|
||||
comparator_(mem.comparator_),
|
||||
valid_(false),
|
||||
seqno_to_time_mapping_(seqno_to_time_mapping),
|
||||
arena_mode_(arena != nullptr),
|
||||
value_pinned_(
|
||||
!mem.GetImmutableMemTableOptions()->inplace_update_support),
|
||||
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
|
||||
status_(Status::OK()),
|
||||
logger_(mem.moptions_.info_log),
|
||||
ts_sz_(mem.ts_sz_) {
|
||||
ts_sz_(mem.ts_sz_),
|
||||
protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
|
||||
valid_(false),
|
||||
value_pinned_(
|
||||
!mem.GetImmutableMemTableOptions()->inplace_update_support),
|
||||
arena_mode_(arena != nullptr),
|
||||
paranoid_memory_checks_(mem.moptions_.paranoid_memory_checks),
|
||||
allow_data_in_error(mem.moptions_.allow_data_in_errors) {
|
||||
if (use_range_del_table) {
|
||||
iter_ = mem.range_del_table_->GetIterator(arena);
|
||||
} else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
|
||||
|
@ -406,6 +409,7 @@ class MemTableIterator : public InternalIterator {
|
|||
} else {
|
||||
delete iter_;
|
||||
}
|
||||
status_.PermitUncheckedError();
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
@ -415,10 +419,16 @@ class MemTableIterator : public InternalIterator {
|
|||
PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
|
||||
#endif
|
||||
|
||||
bool Valid() const override { return valid_ && status_.ok(); }
|
||||
bool Valid() const override {
|
||||
// If inner iter_ is not valid, then this iter should also not be valid.
|
||||
assert(iter_->Valid() || !(valid_ && status_.ok()));
|
||||
return valid_ && status_.ok();
|
||||
}
|
||||
|
||||
void Seek(const Slice& k) override {
|
||||
PERF_TIMER_GUARD(seek_on_memtable_time);
|
||||
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
|
||||
status_ = Status::OK();
|
||||
if (bloom_) {
|
||||
// iterator should only use prefix bloom filter
|
||||
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
|
||||
|
@ -433,13 +443,18 @@ class MemTableIterator : public InternalIterator {
|
|||
}
|
||||
}
|
||||
}
|
||||
iter_->Seek(k, nullptr);
|
||||
if (paranoid_memory_checks_) {
|
||||
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
|
||||
} else {
|
||||
iter_->Seek(k, nullptr);
|
||||
}
|
||||
valid_ = iter_->Valid();
|
||||
VerifyEntryChecksum();
|
||||
}
|
||||
void SeekForPrev(const Slice& k) override {
|
||||
PERF_TIMER_GUARD(seek_on_memtable_time);
|
||||
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
|
||||
status_ = Status::OK();
|
||||
if (bloom_) {
|
||||
Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_));
|
||||
if (prefix_extractor_->InDomain(user_k_without_ts)) {
|
||||
|
@ -453,7 +468,11 @@ class MemTableIterator : public InternalIterator {
|
|||
}
|
||||
}
|
||||
}
|
||||
iter_->Seek(k, nullptr);
|
||||
if (paranoid_memory_checks_) {
|
||||
status_ = iter_->SeekAndValidate(k, nullptr, allow_data_in_error);
|
||||
} else {
|
||||
iter_->Seek(k, nullptr);
|
||||
}
|
||||
valid_ = iter_->Valid();
|
||||
VerifyEntryChecksum();
|
||||
if (!Valid() && status().ok()) {
|
||||
|
@ -464,11 +483,13 @@ class MemTableIterator : public InternalIterator {
|
|||
}
|
||||
}
|
||||
void SeekToFirst() override {
  // Start from a clean status so an error recorded by an earlier positioning
  // call does not carry over into this one.
  status_ = Status::OK();
  iter_->SeekToFirst();
  // Mirror the inner iterator's validity, then check the entry it landed on.
  valid_ = iter_->Valid();
  VerifyEntryChecksum();
}
|
||||
void SeekToLast() override {
|
||||
status_ = Status::OK();
|
||||
iter_->SeekToLast();
|
||||
valid_ = iter_->Valid();
|
||||
VerifyEntryChecksum();
|
||||
|
@ -476,8 +497,12 @@ class MemTableIterator : public InternalIterator {
|
|||
void Next() override {
|
||||
PERF_COUNTER_ADD(next_on_memtable_count, 1);
|
||||
assert(Valid());
|
||||
iter_->Next();
|
||||
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
|
||||
if (paranoid_memory_checks_) {
|
||||
status_ = iter_->NextAndValidate(allow_data_in_error);
|
||||
} else {
|
||||
iter_->Next();
|
||||
TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
|
||||
}
|
||||
valid_ = iter_->Valid();
|
||||
VerifyEntryChecksum();
|
||||
}
|
||||
|
@ -494,7 +519,11 @@ class MemTableIterator : public InternalIterator {
|
|||
void Prev() override {
|
||||
PERF_COUNTER_ADD(prev_on_memtable_count, 1);
|
||||
assert(Valid());
|
||||
iter_->Prev();
|
||||
if (paranoid_memory_checks_) {
|
||||
status_ = iter_->PrevAndValidate(allow_data_in_error);
|
||||
} else {
|
||||
iter_->Prev();
|
||||
}
|
||||
valid_ = iter_->Valid();
|
||||
VerifyEntryChecksum();
|
||||
}
|
||||
|
@ -540,15 +569,17 @@ class MemTableIterator : public InternalIterator {
|
|||
const SliceTransform* const prefix_extractor_;
|
||||
const MemTable::KeyComparator comparator_;
|
||||
MemTableRep::Iterator* iter_;
|
||||
bool valid_;
|
||||
// The seqno to time mapping is owned by the SuperVersion.
|
||||
UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping_;
|
||||
bool arena_mode_;
|
||||
bool value_pinned_;
|
||||
uint32_t protection_bytes_per_key_;
|
||||
Status status_;
|
||||
Logger* logger_;
|
||||
size_t ts_sz_;
|
||||
uint32_t protection_bytes_per_key_;
|
||||
bool valid_;
|
||||
bool value_pinned_;
|
||||
bool arena_mode_;
|
||||
const bool paranoid_memory_checks_;
|
||||
const bool allow_data_in_error;
|
||||
|
||||
void VerifyEntryChecksum() {
|
||||
if (protection_bytes_per_key_ > 0 && Valid()) {
|
||||
|
@ -933,6 +964,8 @@ static bool SaveValue(void* arg, const char* entry) {
|
|||
Saver* s = static_cast<Saver*>(arg);
|
||||
assert(s != nullptr);
|
||||
assert(!s->value || !s->columns);
|
||||
assert(!*(s->found_final_value));
|
||||
assert(s->status->ok() || s->status->IsMergeInProgress());
|
||||
|
||||
MergeContext* merge_context = s->merge_context;
|
||||
SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
|
||||
|
@ -966,6 +999,7 @@ static bool SaveValue(void* arg, const char* entry) {
|
|||
*(s->status) = MemTable::VerifyEntryChecksum(
|
||||
entry, s->protection_bytes_per_key, s->allow_data_in_errors);
|
||||
if (!s->status->ok()) {
|
||||
*(s->found_final_value) = true;
|
||||
ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
|
||||
// Memtable entry corrupted
|
||||
return false;
|
||||
|
@ -1231,6 +1265,7 @@ static bool SaveValue(void* arg, const char* entry) {
|
|||
". ");
|
||||
msg.append("seq: " + std::to_string(seq) + ".");
|
||||
}
|
||||
*(s->found_final_value) = true;
|
||||
*(s->status) = Status::Corruption(msg.c_str());
|
||||
return false;
|
||||
}
|
||||
|
@ -1310,8 +1345,12 @@ bool MemTable::Get(const LookupKey& key, std::string* value,
|
|||
|
||||
// No change to value, since we have not yet found a Put/Delete
|
||||
// Propagate corruption error
|
||||
if (!found_final_value && merge_in_progress && !s->IsCorruption()) {
|
||||
*s = Status::MergeInProgress();
|
||||
if (!found_final_value && merge_in_progress) {
|
||||
if (s->ok()) {
|
||||
*s = Status::MergeInProgress();
|
||||
} else {
|
||||
assert(s->IsMergeInProgress());
|
||||
}
|
||||
}
|
||||
PERF_COUNTER_ADD(get_from_memtable_count, 1);
|
||||
return found_final_value;
|
||||
|
@ -1347,7 +1386,19 @@ void MemTable::GetFromTable(const LookupKey& key,
|
|||
saver.do_merge = do_merge;
|
||||
saver.allow_data_in_errors = moptions_.allow_data_in_errors;
|
||||
saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
|
||||
table_->Get(key, &saver, SaveValue);
|
||||
|
||||
if (!moptions_.paranoid_memory_checks) {
|
||||
table_->Get(key, &saver, SaveValue);
|
||||
} else {
|
||||
Status check_s = table_->GetAndValidate(key, &saver, SaveValue,
|
||||
moptions_.allow_data_in_errors);
|
||||
if (check_s.IsCorruption()) {
|
||||
*(saver.status) = check_s;
|
||||
// Should stop searching the LSM.
|
||||
*(saver.found_final_value) = true;
|
||||
}
|
||||
}
|
||||
assert(s->ok() || s->IsMergeInProgress() || *found_final_value);
|
||||
*seq = saver.seq;
|
||||
}
|
||||
|
||||
|
@ -1421,10 +1472,19 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
|
|||
&found_final_value, &merge_in_progress);
|
||||
|
||||
if (!found_final_value && merge_in_progress) {
|
||||
*(iter->s) = Status::MergeInProgress();
|
||||
if (iter->s->ok()) {
|
||||
*(iter->s) = Status::MergeInProgress();
|
||||
} else {
|
||||
assert(iter->s->IsMergeInProgress());
|
||||
}
|
||||
}
|
||||
|
||||
if (found_final_value) {
|
||||
if (found_final_value ||
|
||||
(!iter->s->ok() && !iter->s->IsMergeInProgress())) {
|
||||
// `found_final_value` should be set if an error/corruption occurs.
|
||||
// The check on iter->s is just there in case GetFromTable() did not
|
||||
// set `found_final_value` properly.
|
||||
assert(found_final_value);
|
||||
if (iter->value) {
|
||||
iter->value->PinSelf();
|
||||
range->AddValueSize(iter->value->size());
|
||||
|
|
|
@ -60,8 +60,9 @@ struct ImmutableMemTableOptions {
|
|||
Statistics* statistics;
|
||||
MergeOperator* merge_operator;
|
||||
Logger* info_log;
|
||||
bool allow_data_in_errors;
|
||||
uint32_t protection_bytes_per_key;
|
||||
bool allow_data_in_errors;
|
||||
bool paranoid_memory_checks;
|
||||
};
|
||||
|
||||
// Batched counters to be updated when inserting keys in one write batch.
|
||||
|
@ -249,12 +250,14 @@ class MemTable {
|
|||
// If do_merge = true the default behavior which is Get value for key is
|
||||
// executed. Expected behavior is described right below.
|
||||
// If memtable contains a value for key, store it in *value and return true.
|
||||
// If memtable contains a deletion for key, store a NotFound() error
|
||||
// in *status and return true.
|
||||
// If memtable contains a deletion for key, store NotFound() in *status and
|
||||
// return true.
|
||||
// If memtable contains Merge operation as the most recent entry for a key,
|
||||
// and the merge process does not stop (not reaching a value or delete),
|
||||
// prepend the current merge operand to *operands.
|
||||
// store MergeInProgress in s, and return false.
|
||||
// If an unexpected error or corruption occurs, store Corruption() or other
|
||||
// error in *status and return true.
|
||||
// Else, return false.
|
||||
// If any operation was found, its most recent sequence number
|
||||
// will be stored in *seq on success (regardless of whether true/false is
|
||||
|
@ -264,6 +267,11 @@ class MemTable {
|
|||
// If do_merge = false then any Merge Operands encountered for key are simply
|
||||
// stored in merge_context.operands_list and never actually merged to get a
|
||||
// final value. The raw Merge Operands are eventually returned to the user.
|
||||
// @param value If not null and memtable contains a value for key, `value`
|
||||
// will be set to the result value.
|
||||
// @param columns If not null and memtable contains a value/WideColumn for key,
|
||||
// `columns` will be set to the result value/WideColumn.
|
||||
// Note: only one of `value` and `columns` can be non-nullptr.
|
||||
// @param immutable_memtable Whether this memtable is immutable. Used
|
||||
// internally by NewRangeTombstoneIterator(). See comment above
|
||||
// NewRangeTombstoneIterator() for more detail.
|
||||
|
|
|
@ -181,7 +181,8 @@ bool MemTableListVersion::GetFromList(
|
|||
}
|
||||
|
||||
if (done) {
|
||||
assert(*seq != kMaxSequenceNumber || s->IsNotFound());
|
||||
assert(*seq != kMaxSequenceNumber ||
|
||||
(!s->ok() && !s->IsMergeInProgress()));
|
||||
return true;
|
||||
}
|
||||
if (!s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
|
||||
|
|
|
@ -287,6 +287,7 @@ TEST_F(MemTableListTest, GetTest) {
|
|||
|
||||
// Fetch the newly written keys
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
|
||||
/*timestamp*/ nullptr, &s, &merge_context,
|
||||
&max_covering_tombstone_seq, ReadOptions(),
|
||||
|
@ -295,6 +296,7 @@ TEST_F(MemTableListTest, GetTest) {
|
|||
ASSERT_EQ(value, "value1");
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr,
|
||||
/*timestamp*/ nullptr, &s, &merge_context,
|
||||
&max_covering_tombstone_seq, ReadOptions(),
|
||||
|
@ -303,6 +305,7 @@ TEST_F(MemTableListTest, GetTest) {
|
|||
ASSERT_TRUE(found && s.IsNotFound());
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
|
||||
/*timestamp*/ nullptr, &s, &merge_context,
|
||||
&max_covering_tombstone_seq, ReadOptions(),
|
||||
|
@ -311,6 +314,7 @@ TEST_F(MemTableListTest, GetTest) {
|
|||
ASSERT_EQ(value, "value2.2");
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = mem->Get(LookupKey("key3", seq), &value, /*columns*/ nullptr,
|
||||
/*timestamp*/ nullptr, &s, &merge_context,
|
||||
&max_covering_tombstone_seq, ReadOptions(),
|
||||
|
@ -350,6 +354,7 @@ TEST_F(MemTableListTest, GetTest) {
|
|||
|
||||
// Fetch keys via MemTableList
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
@ -357,6 +362,7 @@ TEST_F(MemTableListTest, GetTest) {
|
|||
ASSERT_TRUE(found && s.IsNotFound());
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = list.current()->Get(LookupKey("key1", saved_seq), &value,
|
||||
/*columns=*/nullptr, /*timestamp=*/nullptr, &s,
|
||||
&merge_context, &max_covering_tombstone_seq,
|
||||
|
@ -365,6 +371,7 @@ TEST_F(MemTableListTest, GetTest) {
|
|||
ASSERT_EQ("value1", value);
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
@ -373,12 +380,14 @@ TEST_F(MemTableListTest, GetTest) {
|
|||
ASSERT_EQ(value, "value2.3");
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
&max_covering_tombstone_seq, ReadOptions());
|
||||
ASSERT_FALSE(found);
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
@ -438,6 +447,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
|
||||
// Fetch the newly written keys
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
|
||||
/*timestamp*/ nullptr, &s, &merge_context,
|
||||
&max_covering_tombstone_seq, ReadOptions(),
|
||||
|
@ -446,6 +456,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
ASSERT_TRUE(found && s.IsNotFound());
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
|
||||
/*timestamp*/ nullptr, &s, &merge_context,
|
||||
&max_covering_tombstone_seq, ReadOptions(),
|
||||
|
@ -462,6 +473,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
|
||||
// Fetch keys via MemTableList
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
@ -469,6 +481,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
ASSERT_TRUE(found && s.IsNotFound());
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
@ -508,6 +521,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
|
||||
// Verify keys are present in history
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = list.current()->GetFromHistory(
|
||||
LookupKey("key1", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
|
||||
|
@ -515,6 +529,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
ASSERT_TRUE(found && s.IsNotFound());
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = list.current()->GetFromHistory(
|
||||
LookupKey("key2", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
|
||||
|
@ -568,6 +583,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
|
||||
// Verify keys are no longer in MemTableList
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
@ -575,6 +591,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
ASSERT_FALSE(found);
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
@ -582,6 +599,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
ASSERT_FALSE(found);
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
@ -590,6 +608,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
|
||||
// Verify that the second memtable's keys are in the history
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = list.current()->GetFromHistory(
|
||||
LookupKey("key1", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
|
||||
|
@ -597,6 +616,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
ASSERT_TRUE(found && s.IsNotFound());
|
||||
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found = list.current()->GetFromHistory(
|
||||
LookupKey("key3", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
|
||||
|
@ -606,6 +626,7 @@ TEST_F(MemTableListTest, GetFromHistoryTest) {
|
|||
|
||||
// Verify that key2 from the first memtable is no longer in the history
|
||||
merge_context.Clear();
|
||||
s = Status::OK();
|
||||
found =
|
||||
list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
|
||||
/*timestamp=*/nullptr, &s, &merge_context,
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
#include "db/internal_stats.h"
|
||||
#include "db/table_cache.h"
|
||||
#include "db/version_edit.h"
|
||||
#include "db/version_edit_handler.h"
|
||||
#include "db/version_set.h"
|
||||
#include "port/port.h"
|
||||
#include "table/table_reader.h"
|
||||
|
@ -37,6 +38,25 @@
|
|||
namespace ROCKSDB_NAMESPACE {
|
||||
|
||||
class VersionBuilder::Rep {
|
||||
class NewestFirstBySeqNo {
|
||||
public:
|
||||
bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
|
||||
assert(lhs);
|
||||
assert(rhs);
|
||||
|
||||
if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
|
||||
return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
|
||||
}
|
||||
|
||||
if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
|
||||
return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
|
||||
}
|
||||
|
||||
// Break ties by file number
|
||||
return lhs->fd.GetNumber() > rhs->fd.GetNumber();
|
||||
}
|
||||
};
|
||||
|
||||
class NewestFirstByEpochNumber {
|
||||
private:
|
||||
inline static const NewestFirstBySeqNo seqno_cmp;
|
||||
|
@ -249,9 +269,10 @@ class VersionBuilder::Rep {
|
|||
std::unordered_map<uint64_t, int> table_file_levels_;
|
||||
// Current compact cursors that should be changed after the last compaction
|
||||
std::unordered_map<int, InternalKey> updated_compact_cursors_;
|
||||
NewestFirstByEpochNumber level_zero_cmp_by_epochno_;
|
||||
NewestFirstBySeqNo level_zero_cmp_by_seqno_;
|
||||
BySmallestKey level_nonzero_cmp_;
|
||||
const std::shared_ptr<const NewestFirstByEpochNumber>
|
||||
level_zero_cmp_by_epochno_;
|
||||
const std::shared_ptr<const NewestFirstBySeqNo> level_zero_cmp_by_seqno_;
|
||||
const std::shared_ptr<const BySmallestKey> level_nonzero_cmp_;
|
||||
|
||||
// Mutable metadata objects for all blob files affected by the series of
|
||||
// version edits.
|
||||
|
@ -259,11 +280,56 @@ class VersionBuilder::Rep {
|
|||
|
||||
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
|
||||
|
||||
ColumnFamilyData* cfd_;
|
||||
VersionEditHandler* version_edit_handler_;
|
||||
bool track_found_and_missing_files_;
|
||||
// If false, only a complete Version, i.e. one whose files are all found, is
|
||||
// considered valid. If true, in addition to a complete Version, an incomplete
|
||||
// Version that is missing only a suffix of L0 files is also considered valid,
|
||||
// provided the Version was never edited in an atomic group.
|
||||
bool allow_incomplete_valid_version_;
|
||||
|
||||
// These are only tracked if `track_found_and_missing_files_` is enabled.
|
||||
|
||||
// The SST files that are found (blob files not included yet).
|
||||
std::unordered_set<uint64_t> found_files_;
|
||||
// Missing SST files for L0
|
||||
std::unordered_set<uint64_t> l0_missing_files_;
|
||||
// Missing SST files for non L0 levels
|
||||
std::unordered_set<uint64_t> non_l0_missing_files_;
|
||||
// Intermediate SST files (blob files not included yet)
|
||||
std::vector<std::string> intermediate_files_;
|
||||
// The highest file number for all the missing blob files, useful to check
|
||||
// if a complete Version is available.
|
||||
uint64_t missing_blob_files_high_ = kInvalidBlobFileNumber;
|
||||
// Missing blob files, useful to check if only the missing L0 files'
|
||||
// associated blob files are missing.
|
||||
std::unordered_set<uint64_t> missing_blob_files_;
|
||||
// True if all files that make up the Version can be found. Alternatively, if
|
||||
// `allow_incomplete_valid_version_` is true and the version history was never
|
||||
// edited in an atomic group, this is also true when only a suffix of L0 SST
|
||||
// files and their associated blob files are missing.
|
||||
bool valid_version_available_;
|
||||
// True if version is ever edited in an atomic group.
|
||||
bool edited_in_atomic_group_;
|
||||
|
||||
// Flag to indicate if the Version is updated since last validity check. If no
|
||||
// `Apply` call is made between a `Rep`'s construction and a
|
||||
// `ValidVersionAvailable` check or between two `ValidVersionAvailable` calls,
|
||||
// this flag will be false to indicate the cached validity value can be
|
||||
// directly used without a recheck.
|
||||
bool version_updated_since_last_check_;
|
||||
|
||||
// End of fields that are only tracked when `track_found_and_missing_files_`
|
||||
// is enabled.
|
||||
|
||||
public:
|
||||
Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions,
|
||||
TableCache* table_cache, VersionStorageInfo* base_vstorage,
|
||||
VersionSet* version_set,
|
||||
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
|
||||
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr,
|
||||
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
|
||||
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
|
||||
: file_options_(file_options),
|
||||
ioptions_(ioptions),
|
||||
table_cache_(table_cache),
|
||||
|
@ -271,11 +337,76 @@ class VersionBuilder::Rep {
|
|||
version_set_(version_set),
|
||||
num_levels_(base_vstorage->num_levels()),
|
||||
has_invalid_levels_(false),
|
||||
level_nonzero_cmp_(base_vstorage_->InternalComparator()),
|
||||
file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) {
|
||||
level_zero_cmp_by_epochno_(
|
||||
std::make_shared<NewestFirstByEpochNumber>()),
|
||||
level_zero_cmp_by_seqno_(std::make_shared<NewestFirstBySeqNo>()),
|
||||
level_nonzero_cmp_(std::make_shared<BySmallestKey>(
|
||||
base_vstorage_->InternalComparator())),
|
||||
file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr),
|
||||
cfd_(cfd),
|
||||
version_edit_handler_(version_edit_handler),
|
||||
track_found_and_missing_files_(track_found_and_missing_files),
|
||||
allow_incomplete_valid_version_(allow_incomplete_valid_version) {
|
||||
assert(ioptions_);
|
||||
|
||||
levels_ = new LevelState[num_levels_];
|
||||
if (track_found_and_missing_files_) {
|
||||
assert(cfd_);
|
||||
assert(version_edit_handler_);
|
||||
// `track_found_and_missing_files_` mode used by VersionEditHandlerPIT
|
||||
// assumes the initial base version is valid. For best efforts recovery,
|
||||
// base will be empty. For manifest tailing usage like secondary instance,
|
||||
// they do not allow incomplete version, so the base version in subsequent
|
||||
// catch up attempts should be valid too.
|
||||
valid_version_available_ = true;
|
||||
edited_in_atomic_group_ = false;
|
||||
version_updated_since_last_check_ = false;
|
||||
}
|
||||
}
|
||||
|
||||
Rep(const Rep& other)
|
||||
: file_options_(other.file_options_),
|
||||
ioptions_(other.ioptions_),
|
||||
table_cache_(other.table_cache_),
|
||||
base_vstorage_(other.base_vstorage_),
|
||||
version_set_(other.version_set_),
|
||||
num_levels_(other.num_levels_),
|
||||
invalid_level_sizes_(other.invalid_level_sizes_),
|
||||
has_invalid_levels_(other.has_invalid_levels_),
|
||||
table_file_levels_(other.table_file_levels_),
|
||||
updated_compact_cursors_(other.updated_compact_cursors_),
|
||||
level_zero_cmp_by_epochno_(other.level_zero_cmp_by_epochno_),
|
||||
level_zero_cmp_by_seqno_(other.level_zero_cmp_by_seqno_),
|
||||
level_nonzero_cmp_(other.level_nonzero_cmp_),
|
||||
mutable_blob_file_metas_(other.mutable_blob_file_metas_),
|
||||
file_metadata_cache_res_mgr_(other.file_metadata_cache_res_mgr_),
|
||||
cfd_(other.cfd_),
|
||||
version_edit_handler_(other.version_edit_handler_),
|
||||
track_found_and_missing_files_(other.track_found_and_missing_files_),
|
||||
allow_incomplete_valid_version_(other.allow_incomplete_valid_version_),
|
||||
found_files_(other.found_files_),
|
||||
l0_missing_files_(other.l0_missing_files_),
|
||||
non_l0_missing_files_(other.non_l0_missing_files_),
|
||||
intermediate_files_(other.intermediate_files_),
|
||||
missing_blob_files_high_(other.missing_blob_files_high_),
|
||||
missing_blob_files_(other.missing_blob_files_),
|
||||
valid_version_available_(other.valid_version_available_),
|
||||
edited_in_atomic_group_(other.edited_in_atomic_group_),
|
||||
version_updated_since_last_check_(
|
||||
other.version_updated_since_last_check_) {
|
||||
assert(ioptions_);
|
||||
levels_ = new LevelState[num_levels_];
|
||||
for (int level = 0; level < num_levels_; level++) {
|
||||
levels_[level] = other.levels_[level];
|
||||
const auto& added = levels_[level].added_files;
|
||||
for (auto& pair : added) {
|
||||
RefFile(pair.second);
|
||||
}
|
||||
}
|
||||
if (track_found_and_missing_files_) {
|
||||
assert(cfd_);
|
||||
assert(version_edit_handler_);
|
||||
}
|
||||
}
|
||||
|
||||
~Rep() {
|
||||
|
@ -289,6 +420,12 @@ class VersionBuilder::Rep {
|
|||
delete[] levels_;
|
||||
}
|
||||
|
||||
// Takes one additional reference on `f`. The file must be non-null and must
// already hold at least one reference.
void RefFile(FileMetaData* f) {
  assert(f);
  assert(f->refs > 0);
  ++f->refs;
}
|
||||
|
||||
void UnrefFile(FileMetaData* f) {
|
||||
f->refs--;
|
||||
if (f->refs <= 0) {
|
||||
|
@ -397,7 +534,7 @@ class VersionBuilder::Rep {
|
|||
|
||||
if (epoch_number_requirement ==
|
||||
EpochNumberRequirement::kMightMissing) {
|
||||
if (!level_zero_cmp_by_seqno_(lhs, rhs)) {
|
||||
if (!level_zero_cmp_by_seqno_->operator()(lhs, rhs)) {
|
||||
std::ostringstream oss;
|
||||
oss << "L0 files are not sorted properly: files #"
|
||||
<< lhs->fd.GetNumber() << " with seqnos (largest, smallest) "
|
||||
|
@ -429,7 +566,7 @@ class VersionBuilder::Rep {
|
|||
}
|
||||
}
|
||||
|
||||
if (!level_zero_cmp_by_epochno_(lhs, rhs)) {
|
||||
if (!level_zero_cmp_by_epochno_->operator()(lhs, rhs)) {
|
||||
std::ostringstream oss;
|
||||
oss << "L0 files are not sorted properly: files #"
|
||||
<< lhs->fd.GetNumber() << " with epoch number "
|
||||
|
@ -458,7 +595,7 @@ class VersionBuilder::Rep {
|
|||
assert(lhs);
|
||||
assert(rhs);
|
||||
|
||||
if (!level_nonzero_cmp_(lhs, rhs)) {
|
||||
if (!level_nonzero_cmp_->operator()(lhs, rhs)) {
|
||||
std::ostringstream oss;
|
||||
oss << 'L' << level << " files are not sorted properly: files #"
|
||||
<< lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
|
||||
|
@ -634,7 +771,22 @@ class VersionBuilder::Rep {
|
|||
mutable_blob_file_metas_.emplace(
|
||||
blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
|
||||
|
||||
return Status::OK();
|
||||
Status s;
|
||||
if (track_found_and_missing_files_) {
|
||||
assert(version_edit_handler_);
|
||||
s = version_edit_handler_->VerifyBlobFile(cfd_, blob_file_number,
|
||||
blob_file_addition);
|
||||
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
|
||||
missing_blob_files_high_ =
|
||||
std::max(missing_blob_files_high_, blob_file_number);
|
||||
missing_blob_files_.insert(blob_file_number);
|
||||
s = Status::OK();
|
||||
} else if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) {
|
||||
|
@ -752,6 +904,29 @@ class VersionBuilder::Rep {
|
|||
table_file_levels_[file_number] =
|
||||
VersionStorageInfo::FileLocation::Invalid().GetLevel();
|
||||
|
||||
if (track_found_and_missing_files_) {
|
||||
assert(version_edit_handler_);
|
||||
if (l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
|
||||
l0_missing_files_.erase(file_number);
|
||||
} else if (non_l0_missing_files_.find(file_number) !=
|
||||
non_l0_missing_files_.end()) {
|
||||
non_l0_missing_files_.erase(file_number);
|
||||
} else {
|
||||
auto fiter = found_files_.find(file_number);
|
||||
// Only mark new files added during this catchup attempt for deletion.
|
||||
// These files were never installed in VersionStorageInfo.
|
||||
// Already referenced files that are deleted by a VersionEdit will
|
||||
// be added to the VersionStorageInfo's obsolete files when the old
|
||||
// version is dereferenced.
|
||||
if (fiter != found_files_.end()) {
|
||||
assert(!ioptions_->cf_paths.empty());
|
||||
intermediate_files_.emplace_back(
|
||||
MakeTableFileName(ioptions_->cf_paths[0].path, file_number));
|
||||
found_files_.erase(fiter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -824,7 +999,31 @@ class VersionBuilder::Rep {
|
|||
|
||||
table_file_levels_[file_number] = level;
|
||||
|
||||
return Status::OK();
|
||||
Status s;
|
||||
if (track_found_and_missing_files_) {
|
||||
assert(version_edit_handler_);
|
||||
assert(!ioptions_->cf_paths.empty());
|
||||
const std::string fpath =
|
||||
MakeTableFileName(ioptions_->cf_paths[0].path, file_number);
|
||||
s = version_edit_handler_->VerifyFile(cfd_, fpath, level, meta);
|
||||
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
|
||||
if (0 == level) {
|
||||
l0_missing_files_.insert(file_number);
|
||||
} else {
|
||||
non_l0_missing_files_.insert(file_number);
|
||||
}
|
||||
if (s.IsCorruption()) {
|
||||
found_files_.insert(file_number);
|
||||
}
|
||||
s = Status::OK();
|
||||
} else if (!s.ok()) {
|
||||
return s;
|
||||
} else {
|
||||
found_files_.insert(file_number);
|
||||
}
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
Status ApplyCompactCursors(int level,
|
||||
|
@ -845,6 +1044,7 @@ class VersionBuilder::Rep {
|
|||
|
||||
// Apply all of the edits in *edit to the current state.
|
||||
Status Apply(const VersionEdit* edit) {
|
||||
bool version_updated = false;
|
||||
{
|
||||
const Status s = CheckConsistency(base_vstorage_);
|
||||
if (!s.ok()) {
|
||||
|
@ -862,6 +1062,7 @@ class VersionBuilder::Rep {
|
|||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
version_updated = true;
|
||||
}
|
||||
|
||||
// Increase the amount of garbage for blob files affected by GC
|
||||
|
@ -870,6 +1071,7 @@ class VersionBuilder::Rep {
|
|||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
version_updated = true;
|
||||
}
|
||||
|
||||
// Delete table files
|
||||
|
@ -881,6 +1083,7 @@ class VersionBuilder::Rep {
|
|||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
version_updated = true;
|
||||
}
|
||||
|
||||
// Add new table files
|
||||
|
@ -892,6 +1095,7 @@ class VersionBuilder::Rep {
|
|||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
version_updated = true;
|
||||
}
|
||||
|
||||
// Populate compact cursors for round-robin compaction, leave
|
||||
|
@ -904,6 +1108,13 @@ class VersionBuilder::Rep {
|
|||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
if (track_found_and_missing_files_ && version_updated) {
|
||||
version_updated_since_last_check_ = true;
|
||||
if (!edited_in_atomic_group_ && edit->IsInAtomicGroup()) {
|
||||
edited_in_atomic_group_ = true;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -1046,14 +1257,35 @@ class VersionBuilder::Rep {
|
|||
mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes());
|
||||
}
|
||||
|
||||
bool OnlyLinkedToMissingL0Files(
|
||||
const std::unordered_set<uint64_t>& linked_ssts) const {
|
||||
return std::all_of(
|
||||
linked_ssts.begin(), linked_ssts.end(), [&](const uint64_t& element) {
|
||||
return l0_missing_files_.find(element) != l0_missing_files_.end();
|
||||
});
|
||||
}
|
||||
|
||||
// Add the blob file specified by meta to *vstorage if it is determined to
|
||||
// contain valid data (blobs).
|
||||
template <typename Meta>
|
||||
static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) {
|
||||
void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta,
|
||||
uint64_t blob_file_number) const {
|
||||
assert(vstorage);
|
||||
assert(meta);
|
||||
|
||||
if (meta->GetLinkedSsts().empty() &&
|
||||
const auto& linked_ssts = meta->GetLinkedSsts();
|
||||
if (track_found_and_missing_files_) {
|
||||
if (missing_blob_files_.find(blob_file_number) !=
|
||||
missing_blob_files_.end()) {
|
||||
return;
|
||||
}
|
||||
// Leave the empty case for the below blob garbage collection logic.
|
||||
if (!linked_ssts.empty() && OnlyLinkedToMissingL0Files(linked_ssts)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if (linked_ssts.empty() &&
|
||||
meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
|
||||
return;
|
||||
}
|
||||
|
@ -1065,6 +1297,7 @@ class VersionBuilder::Rep {
|
|||
// applied, and save the result into *vstorage.
|
||||
void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
|
||||
assert(vstorage);
|
||||
assert(!track_found_and_missing_files_ || valid_version_available_);
|
||||
|
||||
assert(base_vstorage_);
|
||||
vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
|
||||
|
@ -1080,22 +1313,24 @@ class VersionBuilder::Rep {
|
|||
}
|
||||
|
||||
auto process_base =
|
||||
[vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
|
||||
[this, vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
|
||||
assert(base_meta);
|
||||
|
||||
AddBlobFileIfNeeded(vstorage, base_meta);
|
||||
AddBlobFileIfNeeded(vstorage, base_meta,
|
||||
base_meta->GetBlobFileNumber());
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto process_mutable =
|
||||
[vstorage](const MutableBlobFileMetaData& mutable_meta) {
|
||||
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
|
||||
[this, vstorage](const MutableBlobFileMetaData& mutable_meta) {
|
||||
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta),
|
||||
mutable_meta.GetBlobFileNumber());
|
||||
|
||||
return true;
|
||||
};
|
||||
|
||||
auto process_both = [vstorage](
|
||||
auto process_both = [this, vstorage](
|
||||
const std::shared_ptr<BlobFileMetaData>& base_meta,
|
||||
const MutableBlobFileMetaData& mutable_meta) {
|
||||
assert(base_meta);
|
||||
|
@ -1108,12 +1343,14 @@ class VersionBuilder::Rep {
|
|||
mutable_meta.GetGarbageBlobBytes());
|
||||
assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
|
||||
|
||||
AddBlobFileIfNeeded(vstorage, base_meta);
|
||||
AddBlobFileIfNeeded(vstorage, base_meta,
|
||||
base_meta->GetBlobFileNumber());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
|
||||
AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta),
|
||||
mutable_meta.GetBlobFileNumber());
|
||||
|
||||
return true;
|
||||
};
|
||||
|
@ -1125,6 +1362,10 @@ class VersionBuilder::Rep {
|
|||
void MaybeAddFile(VersionStorageInfo* vstorage, int level,
|
||||
FileMetaData* f) const {
|
||||
const uint64_t file_number = f->fd.GetNumber();
|
||||
if (track_found_and_missing_files_ && level == 0 &&
|
||||
l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const auto& level_state = levels_[level];
|
||||
|
||||
|
@ -1148,6 +1389,29 @@ class VersionBuilder::Rep {
|
|||
}
|
||||
}
|
||||
|
||||
bool ContainsCompleteVersion() const {
|
||||
assert(track_found_and_missing_files_);
|
||||
return l0_missing_files_.empty() && non_l0_missing_files_.empty() &&
|
||||
(missing_blob_files_high_ == kInvalidBlobFileNumber ||
|
||||
missing_blob_files_high_ < GetMinOldestBlobFileNumber());
|
||||
}
|
||||
|
||||
bool HasMissingFiles() const {
|
||||
assert(track_found_and_missing_files_);
|
||||
return !l0_missing_files_.empty() || !non_l0_missing_files_.empty() ||
|
||||
missing_blob_files_high_ != kInvalidBlobFileNumber;
|
||||
}
|
||||
|
||||
// Returns a mutable reference to the list of intermediate SST file paths
// collected while tracking found and missing files.
// NOTE(review): despite the name, this accessor does not clear the list
// itself; presumably the caller is expected to consume and clear it through
// the returned reference -- confirm against callers.
std::vector<std::string>& GetAndClearIntermediateFiles() {
|
||||
assert(track_found_and_missing_files_);
|
||||
return intermediate_files_;
|
||||
}
|
||||
|
||||
// Forgets every SST file number recorded in `found_files_`.
// Only valid when `track_found_and_missing_files_` is enabled.
void ClearFoundFiles() {
|
||||
assert(track_found_and_missing_files_);
|
||||
found_files_.clear();
|
||||
}
|
||||
|
||||
template <typename Cmp>
|
||||
void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const {
|
||||
// Merge the set of added files with the set of pre-existing files.
|
||||
|
@ -1156,6 +1420,16 @@ class VersionBuilder::Rep {
|
|||
const auto& unordered_added_files = levels_[level].added_files;
|
||||
vstorage->Reserve(level, base_files.size() + unordered_added_files.size());
|
||||
|
||||
MergeUnorderdAddedFilesWithBase(
|
||||
base_files, unordered_added_files, cmp,
|
||||
[&](FileMetaData* file) { MaybeAddFile(vstorage, level, file); });
|
||||
}
|
||||
|
||||
template <typename Cmp, typename AddFileFunc>
|
||||
void MergeUnorderdAddedFilesWithBase(
|
||||
const std::vector<FileMetaData*>& base_files,
|
||||
const std::unordered_map<uint64_t, FileMetaData*>& unordered_added_files,
|
||||
Cmp cmp, AddFileFunc add_file_func) const {
|
||||
// Sort added files for the level.
|
||||
std::vector<FileMetaData*> added_files;
|
||||
added_files.reserve(unordered_added_files.size());
|
||||
|
@ -1171,9 +1445,9 @@ class VersionBuilder::Rep {
|
|||
while (added_iter != added_end || base_iter != base_end) {
|
||||
if (base_iter == base_end ||
|
||||
(added_iter != added_end && cmp(*added_iter, *base_iter))) {
|
||||
MaybeAddFile(vstorage, level, *added_iter++);
|
||||
add_file_func(*added_iter++);
|
||||
} else {
|
||||
MaybeAddFile(vstorage, level, *base_iter++);
|
||||
add_file_func(*base_iter++);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1215,13 +1489,13 @@ class VersionBuilder::Rep {
|
|||
}
|
||||
|
||||
if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
|
||||
SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_seqno_);
|
||||
SaveSSTFilesTo(vstorage, /* level */ 0, *level_zero_cmp_by_seqno_);
|
||||
} else {
|
||||
SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_epochno_);
|
||||
SaveSSTFilesTo(vstorage, /* level */ 0, *level_zero_cmp_by_epochno_);
|
||||
}
|
||||
|
||||
for (int level = 1; level < num_levels_; ++level) {
|
||||
SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
|
||||
SaveSSTFilesTo(vstorage, level, *level_nonzero_cmp_);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1232,8 +1506,111 @@ class VersionBuilder::Rep {
|
|||
}
|
||||
}
|
||||
|
||||
bool ValidVersionAvailable() {
|
||||
assert(track_found_and_missing_files_);
|
||||
if (version_updated_since_last_check_) {
|
||||
valid_version_available_ = ContainsCompleteVersion();
|
||||
if (!valid_version_available_ && !edited_in_atomic_group_ &&
|
||||
allow_incomplete_valid_version_) {
|
||||
valid_version_available_ = OnlyMissingL0Suffix();
|
||||
}
|
||||
version_updated_since_last_check_ = false;
|
||||
}
|
||||
return valid_version_available_;
|
||||
}
|
||||
|
||||
bool OnlyMissingL0Suffix() const {
|
||||
if (!non_l0_missing_files_.empty()) {
|
||||
return false;
|
||||
}
|
||||
assert(!(l0_missing_files_.empty() && missing_blob_files_.empty()));
|
||||
|
||||
if (!l0_missing_files_.empty() && !MissingL0FilesAreL0Suffix()) {
|
||||
return false;
|
||||
}
|
||||
if (!missing_blob_files_.empty() &&
|
||||
!RemainingSstFilesNotMissingBlobFiles()) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check missing L0 files are a suffix of expected sorted L0 files.
|
||||
bool MissingL0FilesAreL0Suffix() const {
|
||||
assert(non_l0_missing_files_.empty());
|
||||
assert(!l0_missing_files_.empty());
|
||||
std::vector<FileMetaData*> expected_sorted_l0_files;
|
||||
const auto& base_files = base_vstorage_->LevelFiles(0);
|
||||
const auto& unordered_added_files = levels_[0].added_files;
|
||||
expected_sorted_l0_files.reserve(base_files.size() +
|
||||
unordered_added_files.size());
|
||||
EpochNumberRequirement epoch_number_requirement =
|
||||
base_vstorage_->GetEpochNumberRequirement();
|
||||
|
||||
if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) {
|
||||
MergeUnorderdAddedFilesWithBase(
|
||||
base_files, unordered_added_files, *level_zero_cmp_by_seqno_,
|
||||
[&](FileMetaData* file) {
|
||||
expected_sorted_l0_files.push_back(file);
|
||||
});
|
||||
} else {
|
||||
MergeUnorderdAddedFilesWithBase(
|
||||
base_files, unordered_added_files, *level_zero_cmp_by_epochno_,
|
||||
[&](FileMetaData* file) {
|
||||
expected_sorted_l0_files.push_back(file);
|
||||
});
|
||||
}
|
||||
assert(expected_sorted_l0_files.size() >= l0_missing_files_.size());
|
||||
std::unordered_set<uint64_t> unaddressed_missing_files = l0_missing_files_;
|
||||
for (auto iter = expected_sorted_l0_files.begin();
|
||||
iter != expected_sorted_l0_files.end(); iter++) {
|
||||
uint64_t file_number = (*iter)->fd.GetNumber();
|
||||
if (l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
|
||||
assert(unaddressed_missing_files.find(file_number) !=
|
||||
unaddressed_missing_files.end());
|
||||
unaddressed_missing_files.erase(file_number);
|
||||
} else if (!unaddressed_missing_files.empty()) {
|
||||
return false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check for each of the missing blob file missing, it either is older than
|
||||
// the minimum oldest blob file required by this Version or only linked to
|
||||
// the missing L0 files.
|
||||
bool RemainingSstFilesNotMissingBlobFiles() const {
|
||||
assert(non_l0_missing_files_.empty());
|
||||
assert(!missing_blob_files_.empty());
|
||||
bool no_l0_files_missing = l0_missing_files_.empty();
|
||||
uint64_t min_oldest_blob_file_num = GetMinOldestBlobFileNumber();
|
||||
for (const auto& missing_blob_file : missing_blob_files_) {
|
||||
if (missing_blob_file < min_oldest_blob_file_num) {
|
||||
continue;
|
||||
}
|
||||
auto iter = mutable_blob_file_metas_.find(missing_blob_file);
|
||||
assert(iter != mutable_blob_file_metas_.end());
|
||||
const std::unordered_set<uint64_t>& linked_ssts =
|
||||
iter->second.GetLinkedSsts();
|
||||
// TODO(yuzhangyu): In theory, if no L0 SST files ara missing, and only
|
||||
// blob files exclusively linked to a L0 suffix are missing, we can
|
||||
// recover to a valid point in time too. We don't recover that type of
|
||||
// incomplete Version yet.
|
||||
if (!linked_ssts.empty() && no_l0_files_missing) {
|
||||
return false;
|
||||
}
|
||||
if (!OnlyLinkedToMissingL0Files(linked_ssts)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Save the current state in *vstorage.
|
||||
Status SaveTo(VersionStorageInfo* vstorage) const {
|
||||
assert(!track_found_and_missing_files_ || valid_version_available_);
|
||||
Status s;
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
@ -1266,6 +1643,7 @@ class VersionBuilder::Rep {
|
|||
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
|
||||
uint8_t block_protection_bytes_per_key) {
|
||||
assert(table_cache_ != nullptr);
|
||||
assert(!track_found_and_missing_files_ || valid_version_available_);
|
||||
|
||||
size_t table_cache_capacity =
|
||||
table_cache_->get_cache().get()->GetCapacity();
|
||||
|
@ -1305,6 +1683,11 @@ class VersionBuilder::Rep {
|
|||
for (int level = 0; level < num_levels_; level++) {
|
||||
for (auto& file_meta_pair : levels_[level].added_files) {
|
||||
auto* file_meta = file_meta_pair.second;
|
||||
uint64_t file_number = file_meta->fd.GetNumber();
|
||||
if (track_found_and_missing_files_ && level == 0 &&
|
||||
l0_missing_files_.find(file_number) != l0_missing_files_.end()) {
|
||||
continue;
|
||||
}
|
||||
// If the file has been opened before, just skip it.
|
||||
if (!file_meta->table_reader_handle) {
|
||||
files_meta.emplace_back(file_meta, level);
|
||||
|
@ -1369,9 +1752,13 @@ VersionBuilder::VersionBuilder(
|
|||
const FileOptions& file_options, const ImmutableCFOptions* ioptions,
|
||||
TableCache* table_cache, VersionStorageInfo* base_vstorage,
|
||||
VersionSet* version_set,
|
||||
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
|
||||
std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr,
|
||||
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
|
||||
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
|
||||
: rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
|
||||
version_set, file_metadata_cache_res_mgr)) {}
|
||||
version_set, file_metadata_cache_res_mgr, cfd,
|
||||
version_edit_handler, track_found_and_missing_files,
|
||||
allow_incomplete_valid_version)) {}
|
||||
|
||||
VersionBuilder::~VersionBuilder() = default;
|
||||
|
||||
|
@ -1399,27 +1786,71 @@ Status VersionBuilder::LoadTableHandlers(
|
|||
read_options, block_protection_bytes_per_key);
|
||||
}
|
||||
|
||||
uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
|
||||
return rep_->GetMinOldestBlobFileNumber();
|
||||
void VersionBuilder::CreateOrReplaceSavePoint() {
|
||||
assert(rep_);
|
||||
savepoint_ = std::move(rep_);
|
||||
rep_ = std::make_unique<Rep>(*savepoint_);
|
||||
}
|
||||
|
||||
bool VersionBuilder::ValidVersionAvailable() {
|
||||
return rep_->ValidVersionAvailable();
|
||||
}
|
||||
|
||||
bool VersionBuilder::HasMissingFiles() const { return rep_->HasMissingFiles(); }
|
||||
|
||||
std::vector<std::string>& VersionBuilder::GetAndClearIntermediateFiles() {
|
||||
return rep_->GetAndClearIntermediateFiles();
|
||||
}
|
||||
|
||||
void VersionBuilder::ClearFoundFiles() { return rep_->ClearFoundFiles(); }
|
||||
|
||||
Status VersionBuilder::SaveSavePointTo(VersionStorageInfo* vstorage) const {
|
||||
if (!savepoint_ || !savepoint_->ValidVersionAvailable()) {
|
||||
return Status::InvalidArgument();
|
||||
}
|
||||
return savepoint_->SaveTo(vstorage);
|
||||
}
|
||||
|
||||
Status VersionBuilder::LoadSavePointTableHandlers(
|
||||
InternalStats* internal_stats, int max_threads,
|
||||
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
|
||||
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
||||
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
|
||||
uint8_t block_protection_bytes_per_key) {
|
||||
if (!savepoint_ || !savepoint_->ValidVersionAvailable()) {
|
||||
return Status::InvalidArgument();
|
||||
}
|
||||
return savepoint_->LoadTableHandlers(
|
||||
internal_stats, max_threads, prefetch_index_and_filter_in_cache,
|
||||
is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin,
|
||||
read_options, block_protection_bytes_per_key);
|
||||
}
|
||||
|
||||
void VersionBuilder::ClearSavePoint() { savepoint_.reset(nullptr); }
|
||||
|
||||
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
|
||||
ColumnFamilyData* cfd)
|
||||
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler,
|
||||
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
|
||||
: version_builder_(new VersionBuilder(
|
||||
cfd->current()->version_set()->file_options(), cfd->ioptions(),
|
||||
cfd->table_cache(), cfd->current()->storage_info(),
|
||||
cfd->current()->version_set(),
|
||||
cfd->GetFileMetadataCacheReservationManager())),
|
||||
cfd->GetFileMetadataCacheReservationManager(), cfd,
|
||||
version_edit_handler, track_found_and_missing_files,
|
||||
allow_incomplete_valid_version)),
|
||||
version_(cfd->current()) {
|
||||
version_->Ref();
|
||||
}
|
||||
|
||||
BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
|
||||
ColumnFamilyData* cfd, Version* v)
|
||||
ColumnFamilyData* cfd, Version* v, VersionEditHandler* version_edit_handler,
|
||||
bool track_found_and_missing_files, bool allow_incomplete_valid_version)
|
||||
: version_builder_(new VersionBuilder(
|
||||
cfd->current()->version_set()->file_options(), cfd->ioptions(),
|
||||
cfd->table_cache(), v->storage_info(), v->version_set(),
|
||||
cfd->GetFileMetadataCacheReservationManager())),
|
||||
cfd->GetFileMetadataCacheReservationManager(), cfd,
|
||||
version_edit_handler, track_found_and_missing_files,
|
||||
allow_incomplete_valid_version)),
|
||||
version_(v) {
|
||||
assert(version_ != cfd->current());
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ struct FileMetaData;
|
|||
class InternalStats;
|
||||
class Version;
|
||||
class VersionSet;
|
||||
class VersionEditHandler;
|
||||
class ColumnFamilyData;
|
||||
class CacheReservationManager;
|
||||
|
||||
|
@ -38,22 +39,80 @@ class VersionBuilder {
|
|||
const ImmutableCFOptions* ioptions, TableCache* table_cache,
|
||||
VersionStorageInfo* base_vstorage, VersionSet* version_set,
|
||||
std::shared_ptr<CacheReservationManager>
|
||||
file_metadata_cache_res_mgr = nullptr);
|
||||
file_metadata_cache_res_mgr = nullptr,
|
||||
ColumnFamilyData* cfd = nullptr,
|
||||
VersionEditHandler* version_edit_handler = nullptr,
|
||||
bool track_found_and_missing_files = false,
|
||||
bool allow_incomplete_valid_version = false);
|
||||
~VersionBuilder();
|
||||
|
||||
bool CheckConsistencyForNumLevels();
|
||||
|
||||
Status Apply(const VersionEdit* edit);
|
||||
|
||||
// Save the current Version to the provided `vstorage`.
|
||||
Status SaveTo(VersionStorageInfo* vstorage) const;
|
||||
|
||||
// Load all the table handlers for the current Version in the builder.
|
||||
Status LoadTableHandlers(
|
||||
InternalStats* internal_stats, int max_threads,
|
||||
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
|
||||
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
||||
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
|
||||
uint8_t block_protection_bytes_per_key);
|
||||
uint64_t GetMinOldestBlobFileNumber() const;
|
||||
|
||||
//============APIs only used by VersionEditHandlerPointInTime ============//
|
||||
|
||||
// Creates a save point for the Version that has been built so far. Subsequent
|
||||
// VersionEdits applied to the builder will not affect the Version in this
|
||||
// save point. VersionBuilder currently only supports creating one save point,
|
||||
// so when `CreateOrReplaceSavePoint` is called again, the previous save point
|
||||
// is cleared. `ClearSavePoint` can be called explicitly to clear
|
||||
// the save point too.
|
||||
void CreateOrReplaceSavePoint();
|
||||
|
||||
// A valid Version is available when the builder can find all the files to
// build a `Version`, or when
|
||||
// `allow_incomplete_valid_version_` is true and the version history is never
|
||||
// edited in an atomic group, and only a suffix of L0 SST files and their
|
||||
// associated blob files are missing.
|
||||
// From the users' perspective, missing a suffix of L0 files means missing the
|
||||
// user's most recently written data. So the remaining available files still
|
||||
// present a valid point-in-time view, although for some previous time.
|
||||
// This validity check result will be cached and reused if the Version is not
|
||||
// updated between two validity checks.
|
||||
bool ValidVersionAvailable();
|
||||
|
||||
bool HasMissingFiles() const;
|
||||
|
||||
// When applying a sequence of VersionEdit, intermediate files are the ones
|
||||
// that are added and then deleted. The caller should clear this intermediate
|
||||
// files tracking after calling this API, so that the tracking for subsequent
|
||||
// VersionEdits can start over with a clean state.
|
||||
std::vector<std::string>& GetAndClearIntermediateFiles();
|
||||
|
||||
// Clears all the found files in this Version.
|
||||
void ClearFoundFiles();
|
||||
|
||||
// Save the Version in the save point to the provided `vstorage`.
|
||||
// Non-OK status will be returned if there is not a valid save point.
|
||||
Status SaveSavePointTo(VersionStorageInfo* vstorage) const;
|
||||
|
||||
// Load all the table handlers for the Version in the save point.
|
||||
// Non-OK status will be returned if there is not a valid save point.
|
||||
Status LoadSavePointTableHandlers(
|
||||
InternalStats* internal_stats, int max_threads,
|
||||
bool prefetch_index_and_filter_in_cache, bool is_initial_load,
|
||||
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
||||
size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options,
|
||||
uint8_t block_protection_bytes_per_key);
|
||||
|
||||
void ClearSavePoint();
|
||||
|
||||
//===== End of APIs only used by VersionEditHandlerPointInTime ==========//
|
||||
|
||||
private:
|
||||
class Rep;
|
||||
std::unique_ptr<Rep> savepoint_;
|
||||
std::unique_ptr<Rep> rep_;
|
||||
};
|
||||
|
||||
|
@ -62,8 +121,15 @@ class VersionBuilder {
|
|||
// Both the constructor and the destructor need to be called inside the DB mutex.
|
||||
class BaseReferencedVersionBuilder {
|
||||
public:
|
||||
explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd);
|
||||
BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v);
|
||||
explicit BaseReferencedVersionBuilder(
|
||||
ColumnFamilyData* cfd, VersionEditHandler* version_edit_handler = nullptr,
|
||||
bool track_found_and_missing_files = false,
|
||||
bool allow_incomplete_valid_version = false);
|
||||
BaseReferencedVersionBuilder(
|
||||
ColumnFamilyData* cfd, Version* v,
|
||||
VersionEditHandler* version_edit_handler = nullptr,
|
||||
bool track_found_and_missing_files = false,
|
||||
bool allow_incomplete_valid_version = false);
|
||||
~BaseReferencedVersionBuilder();
|
||||
VersionBuilder* version_builder() const { return version_builder_.get(); }
|
||||
|
||||
|
@ -71,23 +137,4 @@ class BaseReferencedVersionBuilder {
|
|||
std::unique_ptr<VersionBuilder> version_builder_;
|
||||
Version* version_;
|
||||
};
|
||||
|
||||
class NewestFirstBySeqNo {
|
||||
public:
|
||||
bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
|
||||
assert(lhs);
|
||||
assert(rhs);
|
||||
|
||||
if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
|
||||
return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
|
||||
}
|
||||
|
||||
if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
|
||||
return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
|
||||
}
|
||||
|
||||
// Break ties by file number
|
||||
return lhs->fd.GetNumber() > rhs->fd.GetNumber();
|
||||
}
|
||||
};
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -155,6 +155,7 @@ VersionEditHandler::VersionEditHandler(
|
|||
VersionSet* version_set, bool track_found_and_missing_files,
|
||||
bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
|
||||
const ReadOptions& read_options, bool skip_load_table_files,
|
||||
bool allow_incomplete_valid_version,
|
||||
EpochNumberRequirement epoch_number_requirement)
|
||||
: VersionEditHandlerBase(read_options),
|
||||
read_only_(read_only),
|
||||
|
@ -165,6 +166,7 @@ VersionEditHandler::VersionEditHandler(
|
|||
io_tracer_(io_tracer),
|
||||
skip_load_table_files_(skip_load_table_files),
|
||||
initialized_(false),
|
||||
allow_incomplete_valid_version_(allow_incomplete_valid_version),
|
||||
epoch_number_requirement_(epoch_number_requirement) {
|
||||
assert(version_set_ != nullptr);
|
||||
}
|
||||
|
@ -218,15 +220,15 @@ Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
|
|||
|
||||
Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
|
||||
ColumnFamilyData** cfd) {
|
||||
bool cf_in_not_found = false;
|
||||
bool do_not_open_cf = false;
|
||||
bool cf_in_builders = false;
|
||||
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
|
||||
CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
|
||||
|
||||
assert(cfd != nullptr);
|
||||
*cfd = nullptr;
|
||||
const std::string& cf_name = edit.GetColumnFamilyName();
|
||||
Status s;
|
||||
if (cf_in_builders || cf_in_not_found) {
|
||||
if (cf_in_builders || do_not_open_cf) {
|
||||
s = Status::Corruption("MANIFEST adding the same column family twice: " +
|
||||
cf_name);
|
||||
}
|
||||
|
@ -239,7 +241,7 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
|
|||
cf_name.compare(kPersistentStatsColumnFamilyName) == 0;
|
||||
if (cf_options == name_to_options_.end() &&
|
||||
!is_persistent_stats_column_family) {
|
||||
column_families_not_found_.emplace(edit.GetColumnFamily(), cf_name);
|
||||
do_not_open_column_families_.emplace(edit.GetColumnFamily(), cf_name);
|
||||
} else {
|
||||
if (is_persistent_stats_column_family) {
|
||||
ColumnFamilyOptions cfo;
|
||||
|
@ -256,9 +258,9 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
|
|||
|
||||
Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
|
||||
ColumnFamilyData** cfd) {
|
||||
bool cf_in_not_found = false;
|
||||
bool do_not_open_cf = false;
|
||||
bool cf_in_builders = false;
|
||||
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
|
||||
CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
|
||||
|
||||
assert(cfd != nullptr);
|
||||
*cfd = nullptr;
|
||||
|
@ -266,8 +268,8 @@ Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
|
|||
Status s;
|
||||
if (cf_in_builders) {
|
||||
tmp_cfd = DestroyCfAndCleanup(edit);
|
||||
} else if (cf_in_not_found) {
|
||||
column_families_not_found_.erase(edit.GetColumnFamily());
|
||||
} else if (do_not_open_cf) {
|
||||
do_not_open_column_families_.erase(edit.GetColumnFamily());
|
||||
} else {
|
||||
s = Status::Corruption("MANIFEST - dropping non-existing column family");
|
||||
}
|
||||
|
@ -288,22 +290,20 @@ Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) {
|
|||
|
||||
Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
|
||||
ColumnFamilyData** cfd) {
|
||||
bool cf_in_not_found = false;
|
||||
bool do_not_open_cf = false;
|
||||
bool cf_in_builders = false;
|
||||
CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
|
||||
CheckColumnFamilyId(edit, &do_not_open_cf, &cf_in_builders);
|
||||
|
||||
assert(cfd != nullptr);
|
||||
*cfd = nullptr;
|
||||
Status s;
|
||||
if (!cf_in_not_found) {
|
||||
if (!do_not_open_cf) {
|
||||
if (!cf_in_builders) {
|
||||
s = Status::Corruption(
|
||||
"MANIFEST record referencing unknown column family");
|
||||
}
|
||||
ColumnFamilyData* tmp_cfd = nullptr;
|
||||
if (s.ok()) {
|
||||
auto builder_iter = builders_.find(edit.GetColumnFamily());
|
||||
assert(builder_iter != builders_.end());
|
||||
tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily(
|
||||
edit.GetColumnFamily());
|
||||
assert(tmp_cfd != nullptr);
|
||||
|
@ -318,56 +318,33 @@ Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
|
|||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false);
|
||||
if (s.ok()) {
|
||||
s = builder_iter->second->version_builder()->Apply(&edit);
|
||||
}
|
||||
s = MaybeCreateVersionBeforeApplyEdit(edit, tmp_cfd,
|
||||
/*force_create_version=*/false);
|
||||
}
|
||||
*cfd = tmp_cfd;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
// TODO maybe cache the computation result
|
||||
bool VersionEditHandler::HasMissingFiles() const {
|
||||
bool ret = false;
|
||||
for (const auto& elem : cf_to_missing_files_) {
|
||||
const auto& missing_files = elem.second;
|
||||
if (!missing_files.empty()) {
|
||||
ret = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!ret) {
|
||||
for (const auto& elem : cf_to_missing_blob_files_high_) {
|
||||
if (elem.second != kInvalidBlobFileNumber) {
|
||||
ret = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit,
|
||||
bool* cf_in_not_found,
|
||||
bool* do_not_open_cf,
|
||||
bool* cf_in_builders) const {
|
||||
assert(cf_in_not_found != nullptr);
|
||||
assert(do_not_open_cf != nullptr);
|
||||
assert(cf_in_builders != nullptr);
|
||||
// Not found means that user didn't supply that column
|
||||
// family option AND we encountered column family add
|
||||
// record. Once we encounter column family drop record,
|
||||
// we will delete the column family from
|
||||
// column_families_not_found.
|
||||
// do_not_open_column_families_.
|
||||
uint32_t cf_id = edit.GetColumnFamily();
|
||||
bool in_not_found = column_families_not_found_.find(cf_id) !=
|
||||
column_families_not_found_.end();
|
||||
bool in_do_not_open = do_not_open_column_families_.find(cf_id) !=
|
||||
do_not_open_column_families_.end();
|
||||
// in builders means that user supplied that column family
|
||||
// option AND that we encountered column family add record
|
||||
bool in_builders = builders_.find(cf_id) != builders_.end();
|
||||
// They cannot both be true
|
||||
assert(!(in_not_found && in_builders));
|
||||
*cf_in_not_found = in_not_found;
|
||||
assert(!(in_do_not_open && in_builders));
|
||||
*do_not_open_cf = in_do_not_open;
|
||||
*cf_in_builders = in_builders;
|
||||
}
|
||||
|
||||
|
@ -396,9 +373,9 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
|
|||
// There were some column families in the MANIFEST that weren't specified
|
||||
// in the argument. This is OK in read_only mode
|
||||
if (s->ok() && MustOpenAllColumnFamilies() &&
|
||||
!column_families_not_found_.empty()) {
|
||||
!do_not_open_column_families_.empty()) {
|
||||
std::string msg;
|
||||
for (const auto& cf : column_families_not_found_) {
|
||||
for (const auto& cf : do_not_open_column_families_) {
|
||||
msg.append(", ");
|
||||
msg.append(cf.second);
|
||||
}
|
||||
|
@ -453,7 +430,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
|
|||
}
|
||||
assert(cfd->initialized());
|
||||
VersionEdit edit;
|
||||
*s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true);
|
||||
*s = MaybeCreateVersionBeforeApplyEdit(edit, cfd,
|
||||
/*force_create_version=*/true);
|
||||
if (!s->ok()) {
|
||||
break;
|
||||
}
|
||||
|
@ -498,13 +476,9 @@ ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
|
|||
assert(cfd != nullptr);
|
||||
cfd->set_initialized();
|
||||
assert(builders_.find(cf_id) == builders_.end());
|
||||
builders_.emplace(cf_id,
|
||||
VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd)));
|
||||
if (track_found_and_missing_files_) {
|
||||
cf_to_found_files_.emplace(cf_id, std::unordered_set<uint64_t>());
|
||||
cf_to_missing_files_.emplace(cf_id, std::unordered_set<uint64_t>());
|
||||
cf_to_missing_blob_files_high_.emplace(cf_id, kInvalidBlobFileNumber);
|
||||
}
|
||||
builders_.emplace(cf_id, VersionBuilderUPtr(new BaseReferencedVersionBuilder(
|
||||
cfd, this, track_found_and_missing_files_,
|
||||
allow_incomplete_valid_version_)));
|
||||
return cfd;
|
||||
}
|
||||
|
||||
|
@ -514,21 +488,6 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
|
|||
auto builder_iter = builders_.find(cf_id);
|
||||
assert(builder_iter != builders_.end());
|
||||
builders_.erase(builder_iter);
|
||||
if (track_found_and_missing_files_) {
|
||||
auto found_files_iter = cf_to_found_files_.find(cf_id);
|
||||
assert(found_files_iter != cf_to_found_files_.end());
|
||||
cf_to_found_files_.erase(found_files_iter);
|
||||
|
||||
auto missing_files_iter = cf_to_missing_files_.find(cf_id);
|
||||
assert(missing_files_iter != cf_to_missing_files_.end());
|
||||
cf_to_missing_files_.erase(missing_files_iter);
|
||||
|
||||
auto missing_blob_files_high_iter =
|
||||
cf_to_missing_blob_files_high_.find(cf_id);
|
||||
assert(missing_blob_files_high_iter !=
|
||||
cf_to_missing_blob_files_high_.end());
|
||||
cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
|
||||
}
|
||||
ColumnFamilyData* ret =
|
||||
version_set_->GetColumnFamilySet()->GetColumnFamily(cf_id);
|
||||
assert(ret != nullptr);
|
||||
|
@ -538,15 +497,14 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
|
|||
return ret;
|
||||
}
|
||||
|
||||
Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
|
||||
ColumnFamilyData* cfd,
|
||||
bool force_create_version) {
|
||||
Status VersionEditHandler::MaybeCreateVersionBeforeApplyEdit(
|
||||
const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
|
||||
assert(cfd->initialized());
|
||||
Status s;
|
||||
auto builder_iter = builders_.find(cfd->GetID());
|
||||
assert(builder_iter != builders_.end());
|
||||
auto* builder = builder_iter->second->version_builder();
|
||||
if (force_create_version) {
|
||||
auto builder_iter = builders_.find(cfd->GetID());
|
||||
assert(builder_iter != builders_.end());
|
||||
auto* builder = builder_iter->second->version_builder();
|
||||
auto* v = new Version(cfd, version_set_, version_set_->file_options_,
|
||||
*cfd->GetLatestMutableCFOptions(), io_tracer_,
|
||||
version_set_->current_version_number_++,
|
||||
|
@ -562,6 +520,7 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
|
|||
delete v;
|
||||
}
|
||||
}
|
||||
s = builder->Apply(&edit);
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -731,12 +690,13 @@ Status VersionEditHandler::MaybeHandleFileBoundariesForNewFiles(
|
|||
VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
|
||||
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
|
||||
VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
|
||||
const ReadOptions& read_options,
|
||||
const ReadOptions& read_options, bool allow_incomplete_valid_version,
|
||||
EpochNumberRequirement epoch_number_requirement)
|
||||
: VersionEditHandler(read_only, column_families, version_set,
|
||||
/*track_found_and_missing_files=*/true,
|
||||
/*no_error_if_files_missing=*/true, io_tracer,
|
||||
read_options, epoch_number_requirement) {}
|
||||
read_options, allow_incomplete_valid_version,
|
||||
epoch_number_requirement) {}
|
||||
|
||||
VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
|
||||
for (const auto& cfid_and_version : atomic_update_versions_) {
|
||||
|
@ -762,7 +722,8 @@ Status VersionEditHandlerPointInTime::OnAtomicGroupReplayBegin() {
|
|||
assert(!cfd->IsDropped());
|
||||
assert(cfd->initialized());
|
||||
VersionEdit edit;
|
||||
Status s = MaybeCreateVersion(edit, cfd, true /* force_create_version */);
|
||||
Status s = MaybeCreateVersionBeforeApplyEdit(
|
||||
edit, cfd, true /* force_create_version */);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -824,17 +785,17 @@ void VersionEditHandlerPointInTime::CheckIterationResult(
|
|||
}
|
||||
assert(cfd->initialized());
|
||||
auto v_iter = versions_.find(cfd->GetID());
|
||||
auto builder_iter = builders_.find(cfd->GetID());
|
||||
if (v_iter != versions_.end()) {
|
||||
assert(v_iter->second != nullptr);
|
||||
assert(builder_iter != builders_.end());
|
||||
|
||||
version_set_->AppendVersion(cfd, v_iter->second);
|
||||
versions_.erase(v_iter);
|
||||
// Let's clear found_files, since any files in that are part of the
|
||||
// installed Version. Any files that got obsoleted would have already
|
||||
// been moved to intermediate_files_
|
||||
auto found_files_iter = cf_to_found_files_.find(cfd->GetID());
|
||||
assert(found_files_iter != cf_to_found_files_.end());
|
||||
found_files_iter->second.clear();
|
||||
builder_iter->second->version_builder()->ClearFoundFiles();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -863,147 +824,50 @@ ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(
|
|||
return cfd;
|
||||
}
|
||||
|
||||
Status VersionEditHandlerPointInTime::MaybeCreateVersion(
|
||||
Status VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit(
|
||||
const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
|
||||
TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1");
|
||||
TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2");
|
||||
TEST_SYNC_POINT(
|
||||
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
|
||||
"Begin1");
|
||||
TEST_SYNC_POINT(
|
||||
"VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit:"
|
||||
"Begin2");
|
||||
assert(cfd != nullptr);
|
||||
if (!force_create_version) {
|
||||
assert(edit.GetColumnFamily() == cfd->GetID());
|
||||
}
|
||||
auto found_files_iter = cf_to_found_files_.find(cfd->GetID());
|
||||
assert(found_files_iter != cf_to_found_files_.end());
|
||||
std::unordered_set<uint64_t>& found_files = found_files_iter->second;
|
||||
|
||||
auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
|
||||
assert(missing_files_iter != cf_to_missing_files_.end());
|
||||
std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
|
||||
|
||||
auto missing_blob_files_high_iter =
|
||||
cf_to_missing_blob_files_high_.find(cfd->GetID());
|
||||
assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
|
||||
const uint64_t prev_missing_blob_file_high =
|
||||
missing_blob_files_high_iter->second;
|
||||
|
||||
VersionBuilder* builder = nullptr;
|
||||
|
||||
if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
|
||||
auto builder_iter = builders_.find(cfd->GetID());
|
||||
assert(builder_iter != builders_.end());
|
||||
builder = builder_iter->second->version_builder();
|
||||
assert(builder != nullptr);
|
||||
}
|
||||
|
||||
// At this point, we have not yet applied the new version edits read from the
|
||||
// MANIFEST. We check whether we have any missing table and blob files.
|
||||
const bool prev_has_missing_files =
|
||||
!missing_files.empty() ||
|
||||
(prev_missing_blob_file_high != kInvalidBlobFileNumber &&
|
||||
prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());
|
||||
|
||||
for (const auto& file : edit.GetDeletedFiles()) {
|
||||
uint64_t file_num = file.second;
|
||||
auto fiter = missing_files.find(file_num);
|
||||
if (fiter != missing_files.end()) {
|
||||
missing_files.erase(fiter);
|
||||
} else {
|
||||
fiter = found_files.find(file_num);
|
||||
// Only mark new files added during this catchup attempt for deletion.
|
||||
// These files were never installed in VersionStorageInfo.
|
||||
// Already referenced files that are deleted by a VersionEdit will
|
||||
// be added to the VersionStorageInfo's obsolete files when the old
|
||||
// version is dereferenced.
|
||||
if (fiter != found_files.end()) {
|
||||
intermediate_files_.emplace_back(
|
||||
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num));
|
||||
found_files.erase(fiter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
assert(!cfd->ioptions()->cf_paths.empty());
|
||||
Status s;
|
||||
for (const auto& elem : edit.GetNewFiles()) {
|
||||
int level = elem.first;
|
||||
const FileMetaData& meta = elem.second;
|
||||
const FileDescriptor& fd = meta.fd;
|
||||
uint64_t file_num = fd.GetNumber();
|
||||
const std::string fpath =
|
||||
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
|
||||
s = VerifyFile(cfd, fpath, level, meta);
|
||||
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
|
||||
missing_files.insert(file_num);
|
||||
if (s.IsCorruption()) {
|
||||
found_files.insert(file_num);
|
||||
}
|
||||
s = Status::OK();
|
||||
} else if (!s.ok()) {
|
||||
break;
|
||||
} else {
|
||||
found_files.insert(file_num);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t missing_blob_file_num = prev_missing_blob_file_high;
|
||||
for (const auto& elem : edit.GetBlobFileAdditions()) {
|
||||
uint64_t file_num = elem.GetBlobFileNumber();
|
||||
s = VerifyBlobFile(cfd, file_num, elem);
|
||||
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
|
||||
missing_blob_file_num = std::max(missing_blob_file_num, file_num);
|
||||
s = Status::OK();
|
||||
} else if (!s.ok()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
bool has_missing_blob_files = false;
|
||||
if (missing_blob_file_num != kInvalidBlobFileNumber &&
|
||||
missing_blob_file_num >= prev_missing_blob_file_high) {
|
||||
missing_blob_files_high_iter->second = missing_blob_file_num;
|
||||
has_missing_blob_files = true;
|
||||
} else if (missing_blob_file_num < prev_missing_blob_file_high) {
|
||||
assert(false);
|
||||
}
|
||||
|
||||
// We still have not applied the new version edit, but have tried to add new
|
||||
// table and blob files after verifying their presence and consistency.
|
||||
// Therefore, we know whether we will see new missing table and blob files
|
||||
// later after actually applying the version edit. We perform the check here
|
||||
// and record the result.
|
||||
const bool has_missing_files =
|
||||
!missing_files.empty() || has_missing_blob_files;
|
||||
|
||||
bool missing_info = !version_edit_params_.HasLogNumber() ||
|
||||
!version_edit_params_.HasNextFile() ||
|
||||
!version_edit_params_.HasLastSequence();
|
||||
|
||||
// Create version before applying the edit. The version will represent the state
|
||||
// before applying the version edit.
|
||||
Status s;
|
||||
auto builder_iter = builders_.find(cfd->GetID());
|
||||
assert(builder_iter != builders_.end());
|
||||
VersionBuilder* builder = builder_iter->second->version_builder();
|
||||
const bool valid_pit_before_edit = builder->ValidVersionAvailable();
|
||||
builder->CreateOrReplaceSavePoint();
|
||||
s = builder->Apply(&edit);
|
||||
const bool valid_pit_after_edit = builder->ValidVersionAvailable();
|
||||
|
||||
// A new version will be created if:
|
||||
// 1) no error has occurred so far, and
|
||||
// 2) log_number_, next_file_number_ and last_sequence_ are known, and
|
||||
// 3) not in an AtomicGroup
|
||||
// 4) any of the following:
|
||||
// a) no missing file before, but will have missing file(s) after applying
|
||||
// this version edit.
|
||||
// b) no missing file after applying the version edit, and the caller
|
||||
// explicitly request that a new version be created.
|
||||
// a) a valid Version is available before applying the edit
|
||||
// and a valid Version is not available after the edit.
|
||||
// b) a valid Version is available after the edit and the
|
||||
// caller explicitly requests that a new version be created.
|
||||
if (s.ok() && !missing_info && !in_atomic_group_ &&
|
||||
((has_missing_files && !prev_has_missing_files) ||
|
||||
(!has_missing_files && force_create_version))) {
|
||||
if (!builder) {
|
||||
auto builder_iter = builders_.find(cfd->GetID());
|
||||
assert(builder_iter != builders_.end());
|
||||
builder = builder_iter->second->version_builder();
|
||||
assert(builder);
|
||||
}
|
||||
|
||||
((!valid_pit_after_edit && valid_pit_before_edit) ||
|
||||
(valid_pit_after_edit && force_create_version))) {
|
||||
const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions();
|
||||
auto* version = new Version(cfd, version_set_, version_set_->file_options_,
|
||||
*cf_opts_ptr, io_tracer_,
|
||||
version_set_->current_version_number_++,
|
||||
epoch_number_requirement_);
|
||||
s = builder->LoadTableHandlers(
|
||||
s = builder->LoadSavePointTableHandlers(
|
||||
cfd->internal_stats(),
|
||||
version_set_->db_options_->max_file_opening_threads, false, true,
|
||||
cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr),
|
||||
|
@ -1015,7 +879,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
|
|||
}
|
||||
return s;
|
||||
}
|
||||
s = builder->SaveTo(version->storage_info());
|
||||
s = builder->SaveSavePointTo(version->storage_info());
|
||||
if (s.ok()) {
|
||||
if (AtomicUpdateVersionsContains(cfd->GetID())) {
|
||||
AtomicUpdateVersionsPut(version);
|
||||
|
@ -1038,6 +902,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
|
|||
delete version;
|
||||
}
|
||||
}
|
||||
|
||||
builder->ClearSavePoint();
|
||||
return s;
|
||||
}
|
||||
|
||||
|
@ -1072,6 +938,15 @@ Status VersionEditHandlerPointInTime::LoadTables(
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
bool VersionEditHandlerPointInTime::HasMissingFiles() const {
|
||||
for (const auto& builder : builders_) {
|
||||
if (builder.second->version_builder()->HasMissingFiles()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool VersionEditHandlerPointInTime::AtomicUpdateVersionsCompleted() {
|
||||
return atomic_update_versions_missing_ == 0;
|
||||
}
|
||||
|
@ -1145,8 +1020,9 @@ Status ManifestTailer::Initialize() {
|
|||
Version* base_version = dummy_version->Next();
|
||||
assert(base_version);
|
||||
base_version->Ref();
|
||||
VersionBuilderUPtr new_builder(
|
||||
new BaseReferencedVersionBuilder(default_cfd, base_version));
|
||||
VersionBuilderUPtr new_builder(new BaseReferencedVersionBuilder(
|
||||
default_cfd, base_version, this, track_found_and_missing_files_,
|
||||
allow_incomplete_valid_version_));
|
||||
builder_iter->second = std::move(new_builder);
|
||||
|
||||
initialized_ = true;
|
||||
|
@ -1189,8 +1065,8 @@ Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit,
|
|||
Version* base_version = dummy_version->Next();
|
||||
assert(base_version);
|
||||
base_version->Ref();
|
||||
VersionBuilderUPtr new_builder(
|
||||
new BaseReferencedVersionBuilder(tmp_cfd, base_version));
|
||||
VersionBuilderUPtr new_builder(new BaseReferencedVersionBuilder(
|
||||
tmp_cfd, base_version, this, track_found_and_missing_files_));
|
||||
builder_iter->second = std::move(new_builder);
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
@ -1213,6 +1089,18 @@ void ManifestTailer::CheckIterationResult(const log::Reader& reader,
|
|||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> ManifestTailer::GetAndClearIntermediateFiles() {
|
||||
std::vector<std::string> res;
|
||||
for (const auto& builder : builders_) {
|
||||
auto files =
|
||||
builder.second->version_builder()->GetAndClearIntermediateFiles();
|
||||
res.insert(res.end(), std::make_move_iterator(files.begin()),
|
||||
std::make_move_iterator(files.end()));
|
||||
files.erase(files.begin(), files.end());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
|
||||
const std::string& fpath, int level,
|
||||
const FileMetaData& fmeta) {
|
||||
|
|
|
@ -100,7 +100,9 @@ using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
|
|||
// A class used for scanning MANIFEST file.
|
||||
// VersionEditHandler reads a MANIFEST file, parses the version edits, and
|
||||
// builds the version set's in-memory state, e.g. the version storage info for
|
||||
// the versions of column families.
|
||||
// the versions of column families. It replays all the version edits in one
|
||||
// MANIFEST file to build the end version.
|
||||
//
|
||||
// To use this class and its subclasses,
|
||||
// 1. Create an object of VersionEditHandler or its subclasses.
|
||||
// VersionEditHandler handler(read_only, column_families, version_set,
|
||||
|
@ -119,13 +121,14 @@ class VersionEditHandler : public VersionEditHandlerBase {
|
|||
VersionSet* version_set, bool track_found_and_missing_files,
|
||||
bool no_error_if_files_missing,
|
||||
const std::shared_ptr<IOTracer>& io_tracer,
|
||||
const ReadOptions& read_options,
|
||||
const ReadOptions& read_options, bool allow_incomplete_valid_version,
|
||||
EpochNumberRequirement epoch_number_requirement =
|
||||
EpochNumberRequirement::kMustPresent)
|
||||
: VersionEditHandler(read_only, column_families, version_set,
|
||||
track_found_and_missing_files,
|
||||
no_error_if_files_missing, io_tracer, read_options,
|
||||
/*skip_load_table_files=*/false,
|
||||
allow_incomplete_valid_version,
|
||||
epoch_number_requirement) {}
|
||||
|
||||
~VersionEditHandler() override {}
|
||||
|
@ -134,14 +137,24 @@ class VersionEditHandler : public VersionEditHandlerBase {
|
|||
return version_edit_params_;
|
||||
}
|
||||
|
||||
bool HasMissingFiles() const;
|
||||
|
||||
void GetDbId(std::string* db_id) const {
|
||||
if (db_id && version_edit_params_.HasDbId()) {
|
||||
*db_id = version_edit_params_.GetDbId();
|
||||
}
|
||||
}
|
||||
|
||||
virtual Status VerifyFile(ColumnFamilyData* /*cfd*/,
|
||||
const std::string& /*fpath*/, int /*level*/,
|
||||
const FileMetaData& /*fmeta*/) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
virtual Status VerifyBlobFile(ColumnFamilyData* /*cfd*/,
|
||||
uint64_t /*blob_file_num*/,
|
||||
const BlobFileAddition& /*blob_addition*/) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
protected:
|
||||
explicit VersionEditHandler(
|
||||
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
|
||||
|
@ -149,6 +162,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
|
|||
bool no_error_if_files_missing,
|
||||
const std::shared_ptr<IOTracer>& io_tracer,
|
||||
const ReadOptions& read_options, bool skip_load_table_files,
|
||||
bool allow_incomplete_valid_version,
|
||||
EpochNumberRequirement epoch_number_requirement =
|
||||
EpochNumberRequirement::kMustPresent);
|
||||
|
||||
|
@ -166,7 +180,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
|
|||
|
||||
Status Initialize() override;
|
||||
|
||||
void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
|
||||
void CheckColumnFamilyId(const VersionEdit& edit, bool* do_not_open_cf,
|
||||
bool* cf_in_builders) const;
|
||||
|
||||
void CheckIterationResult(const log::Reader& reader, Status* s) override;
|
||||
|
@ -176,9 +190,9 @@ class VersionEditHandler : public VersionEditHandlerBase {
|
|||
|
||||
virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
|
||||
|
||||
virtual Status MaybeCreateVersion(const VersionEdit& edit,
|
||||
ColumnFamilyData* cfd,
|
||||
bool force_create_version);
|
||||
virtual Status MaybeCreateVersionBeforeApplyEdit(const VersionEdit& edit,
|
||||
ColumnFamilyData* cfd,
|
||||
bool force_create_version);
|
||||
|
||||
virtual Status LoadTables(ColumnFamilyData* cfd,
|
||||
bool prefetch_index_and_filter_in_cache,
|
||||
|
@ -191,21 +205,23 @@ class VersionEditHandler : public VersionEditHandlerBase {
|
|||
VersionSet* version_set_;
|
||||
std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
|
||||
std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
|
||||
// Keeps track of column families in manifest that were not found in
|
||||
// column families parameters. if those column families are not dropped
|
||||
// by subsequent manifest records, Recover() will return failure status.
|
||||
std::unordered_map<uint32_t, std::string> column_families_not_found_;
|
||||
VersionEditParams version_edit_params_;
|
||||
const bool track_found_and_missing_files_;
|
||||
std::unordered_map<uint32_t, std::unordered_set<uint64_t>> cf_to_found_files_;
|
||||
std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
|
||||
cf_to_missing_files_;
|
||||
std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
|
||||
// Keeps track of column families in manifest that were not found in
|
||||
// column families parameters. Namely, the user asks to not open these column
|
||||
// families. In non read only mode, if those column families are not dropped
|
||||
// by subsequent manifest records, Recover() will return failure status.
|
||||
std::unordered_map<uint32_t, std::string> do_not_open_column_families_;
|
||||
VersionEditParams version_edit_params_;
|
||||
bool no_error_if_files_missing_;
|
||||
std::shared_ptr<IOTracer> io_tracer_;
|
||||
bool skip_load_table_files_;
|
||||
bool initialized_;
|
||||
std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
|
||||
// If false, only a complete Version for which all constituent files can be
|
||||
// found is considered a valid Version. If true, besides complete Version, an
|
||||
// incomplete Version with only a suffix of L0 files missing is also
|
||||
// considered valid if the Version is never edited in an atomic group.
|
||||
const bool allow_incomplete_valid_version_;
|
||||
EpochNumberRequirement epoch_number_requirement_;
|
||||
std::unordered_set<uint32_t> cfds_to_mark_no_udt_;
|
||||
|
||||
|
@ -226,8 +242,18 @@ class VersionEditHandler : public VersionEditHandlerBase {
|
|||
};
|
||||
|
||||
// A class similar to its base class, i.e. VersionEditHandler.
|
||||
// VersionEditHandlerPointInTime restores the versions to the most recent point
|
||||
// in time such that at this point, the version does not have missing files.
|
||||
// Unlike VersionEditHandler that only aims to build the end version, this class
|
||||
// supports building the most recent point in time version. A point in time
|
||||
// version is a version for which no files are missing, or if
|
||||
// `allow_incomplete_valid_version` is true, only a suffix of L0 files (and
|
||||
// their associated blob files) are missing.
|
||||
//
|
||||
// Building a point in time version when end version is not available can
|
||||
// be useful for best efforts recovery (options.best_efforts_recovery), which
|
||||
// uses this class and sets `allow_incomplete_valid_version` to true.
|
||||
// It's also useful for secondary instances/follower instances for which end
|
||||
// version could be transiently unavailable. These two cases use subclass
|
||||
// `ManifestTailer` and set `allow_incomplete_valid_version` to false.
|
||||
//
|
||||
// Not thread-safe, external synchronization is necessary if an object of
|
||||
// VersionEditHandlerPointInTime is shared by multiple threads.
|
||||
|
@ -236,28 +262,32 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
|
|||
VersionEditHandlerPointInTime(
|
||||
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
|
||||
VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
|
||||
const ReadOptions& read_options,
|
||||
const ReadOptions& read_options, bool allow_incomplete_valid_version,
|
||||
EpochNumberRequirement epoch_number_requirement =
|
||||
EpochNumberRequirement::kMustPresent);
|
||||
~VersionEditHandlerPointInTime() override;
|
||||
|
||||
bool HasMissingFiles() const;
|
||||
|
||||
virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
|
||||
int level, const FileMetaData& fmeta) override;
|
||||
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
|
||||
const BlobFileAddition& blob_addition) override;
|
||||
|
||||
protected:
|
||||
Status OnAtomicGroupReplayBegin() override;
|
||||
Status OnAtomicGroupReplayEnd() override;
|
||||
void CheckIterationResult(const log::Reader& reader, Status* s) override;
|
||||
|
||||
ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
|
||||
// `MaybeCreateVersion(..., false)` creates a version upon a negative edge
|
||||
// trigger (transition from valid to invalid).
|
||||
// `MaybeCreateVersionBeforeApplyEdit(..., false)` creates a version upon a
|
||||
// negative edge trigger (transition from valid to invalid).
|
||||
//
|
||||
// `MaybeCreateVersion(..., true)` creates a version on a positive level
|
||||
// trigger (state is valid).
|
||||
Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
|
||||
bool force_create_version) override;
|
||||
virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
|
||||
int level, const FileMetaData& fmeta);
|
||||
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
|
||||
const BlobFileAddition& blob_addition);
|
||||
// `MaybeCreateVersionBeforeApplyEdit(..., true)` creates a version on a
|
||||
// positive level trigger (state is valid).
|
||||
Status MaybeCreateVersionBeforeApplyEdit(const VersionEdit& edit,
|
||||
ColumnFamilyData* cfd,
|
||||
bool force_create_version) override;
|
||||
|
||||
Status LoadTables(ColumnFamilyData* cfd,
|
||||
bool prefetch_index_and_filter_in_cache,
|
||||
|
@ -275,8 +305,6 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
|
|||
|
||||
bool in_atomic_group_ = false;
|
||||
|
||||
std::vector<std::string> intermediate_files_;
|
||||
|
||||
private:
|
||||
bool AtomicUpdateVersionsCompleted();
|
||||
bool AtomicUpdateVersionsContains(uint32_t cfid);
|
||||
|
@ -292,6 +320,12 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
|
|||
void AtomicUpdateVersionsApply();
|
||||
};
|
||||
|
||||
// A class similar to `VersionEditHandlerPointInTime` that parses MANIFEST and
|
||||
// builds point in time version.
|
||||
// `ManifestTailer` supports reading one MANIFEST file in multiple tailing
|
||||
// attempts and supports switching to a different MANIFEST after
|
||||
// `PrepareToReadNewManifest` is called. This class is used by secondary and
|
||||
// follower instances.
|
||||
class ManifestTailer : public VersionEditHandlerPointInTime {
|
||||
public:
|
||||
explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
|
||||
|
@ -302,9 +336,13 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
|
|||
EpochNumberRequirement::kMustPresent)
|
||||
: VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
|
||||
version_set, io_tracer, read_options,
|
||||
/*allow_incomplete_valid_version=*/false,
|
||||
epoch_number_requirement),
|
||||
mode_(Mode::kRecovery) {}
|
||||
|
||||
Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
|
||||
const FileMetaData& fmeta) override;
|
||||
|
||||
void PrepareToReadNewManifest() {
|
||||
initialized_ = false;
|
||||
ClearReadBuffer();
|
||||
|
@ -314,9 +352,7 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
|
|||
return cfds_changed_;
|
||||
}
|
||||
|
||||
std::vector<std::string>& GetIntermediateFiles() {
|
||||
return intermediate_files_;
|
||||
}
|
||||
std::vector<std::string> GetAndClearIntermediateFiles();
|
||||
|
||||
protected:
|
||||
Status Initialize() override;
|
||||
|
@ -329,9 +365,6 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
|
|||
|
||||
void CheckIterationResult(const log::Reader& reader, Status* s) override;
|
||||
|
||||
Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
|
||||
const FileMetaData& fmeta) override;
|
||||
|
||||
enum Mode : uint8_t {
|
||||
kRecovery = 0,
|
||||
kCatchUp = 1,
|
||||
|
@ -352,7 +385,9 @@ class DumpManifestHandler : public VersionEditHandler {
|
|||
/*read_only=*/true, column_families, version_set,
|
||||
/*track_found_and_missing_files=*/false,
|
||||
/*no_error_if_files_missing=*/false, io_tracer, read_options,
|
||||
/*skip_load_table_files=*/true),
|
||||
/*skip_load_table_files=*/true,
|
||||
/*allow_incomplete_valid_version=*/false,
|
||||
/*epoch_number_requirement=*/EpochNumberRequirement::kMustPresent),
|
||||
verbose_(verbose),
|
||||
hex_(hex),
|
||||
json_(json),
|
||||
|
|
|
@ -5511,6 +5511,10 @@ Status VersionSet::ProcessManifestWrites(
|
|||
std::unique_ptr<log::Writer> new_desc_log_ptr;
|
||||
{
|
||||
FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
|
||||
// DB option (in file_options_) takes precedence when not kUnknown
|
||||
if (file_options_.temperature != Temperature::kUnknown) {
|
||||
opt_file_opts.temperature = file_options_.temperature;
|
||||
}
|
||||
mu->Unlock();
|
||||
TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
|
||||
TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
|
||||
|
@ -5637,9 +5641,9 @@ Status VersionSet::ProcessManifestWrites(
|
|||
assert(manifest_io_status.ok());
|
||||
}
|
||||
if (s.ok() && new_descriptor_log) {
|
||||
io_s = SetCurrentFile(write_options, fs_.get(), dbname_,
|
||||
pending_manifest_file_number_,
|
||||
dir_contains_current_file);
|
||||
io_s = SetCurrentFile(
|
||||
write_options, fs_.get(), dbname_, pending_manifest_file_number_,
|
||||
file_options_.temperature, dir_contains_current_file);
|
||||
if (!io_s.ok()) {
|
||||
s = io_s;
|
||||
// Quarantine old manifest file in case new manifest file's CURRENT file
|
||||
|
@ -6080,7 +6084,8 @@ Status VersionSet::Recover(
|
|||
VersionEditHandler handler(
|
||||
read_only, column_families, const_cast<VersionSet*>(this),
|
||||
/*track_found_and_missing_files=*/false, no_error_if_files_missing,
|
||||
io_tracer_, read_options, EpochNumberRequirement::kMightMissing);
|
||||
io_tracer_, read_options, /*allow_incomplete_valid_version=*/false,
|
||||
EpochNumberRequirement::kMightMissing);
|
||||
handler.Iterate(reader, &log_read_status);
|
||||
s = handler.status();
|
||||
if (s.ok()) {
|
||||
|
@ -6256,7 +6261,8 @@ Status VersionSet::TryRecoverFromOneManifest(
|
|||
/*checksum=*/true, /*log_num=*/0);
|
||||
VersionEditHandlerPointInTime handler_pit(
|
||||
read_only, column_families, const_cast<VersionSet*>(this), io_tracer_,
|
||||
read_options, EpochNumberRequirement::kMightMissing);
|
||||
read_options, /*allow_incomplete_valid_version=*/true,
|
||||
EpochNumberRequirement::kMightMissing);
|
||||
|
||||
handler_pit.Iterate(reader, &s);
|
||||
|
||||
|
@ -7477,7 +7483,7 @@ Status ReactiveVersionSet::ReadAndApply(
|
|||
*cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
|
||||
}
|
||||
if (files_to_delete) {
|
||||
*files_to_delete = std::move(manifest_tailer_->GetIntermediateFiles());
|
||||
*files_to_delete = manifest_tailer_->GetAndClearIntermediateFiles();
|
||||
}
|
||||
|
||||
return s;
|
||||
|
|
|
@ -1277,6 +1277,15 @@ class VersionSet {
|
|||
bool no_error_if_files_missing = false, bool is_retry = false,
|
||||
Status* log_status = nullptr);
|
||||
|
||||
// Do a best-efforts recovery (Options.best_efforts_recovery=true) from all
|
||||
// available MANIFEST files. Similar to `Recover` with these differences:
|
||||
// 1) not only the latest MANIFEST can be used, if it's not available or
|
||||
// no successful recovery can be achieved with it, this function also tries
|
||||
// to recover from previous MANIFEST files, in reverse chronological order
|
||||
// until a successful recovery can be achieved.
|
||||
// 2) this function doesn't just aim to recover to the latest version, if that
|
||||
// is not available, the most recent point in time version will be saved in
|
||||
// memory. Check doc for `VersionEditHandlerPointInTime` for more details.
|
||||
Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
|
||||
bool read_only,
|
||||
const std::vector<std::string>& files_in_dbname,
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
|
||||
#include <algorithm>
|
||||
|
||||
#include "db/blob/blob_log_writer.h"
|
||||
#include "db/db_impl/db_impl.h"
|
||||
#include "db/db_test_util.h"
|
||||
#include "db/log_writer.h"
|
||||
|
@ -1345,18 +1346,27 @@ class VersionSetTestBase {
|
|||
std::string key; // the only key
|
||||
int level = 0;
|
||||
uint64_t epoch_number;
|
||||
bool file_missing = false;
|
||||
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
|
||||
SstInfo(uint64_t file_num, const std::string& cf_name,
|
||||
const std::string& _key,
|
||||
uint64_t _epoch_number = kUnknownEpochNumber)
|
||||
: SstInfo(file_num, cf_name, _key, 0, _epoch_number) {}
|
||||
uint64_t _epoch_number = kUnknownEpochNumber,
|
||||
bool _file_missing = false,
|
||||
uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
|
||||
: SstInfo(file_num, cf_name, _key, 0, _epoch_number, _file_missing,
|
||||
_oldest_blob_file_number) {}
|
||||
SstInfo(uint64_t file_num, const std::string& cf_name,
|
||||
const std::string& _key, int lvl,
|
||||
uint64_t _epoch_number = kUnknownEpochNumber)
|
||||
uint64_t _epoch_number = kUnknownEpochNumber,
|
||||
bool _file_missing = false,
|
||||
uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
|
||||
: file_number(file_num),
|
||||
column_family(cf_name),
|
||||
key(_key),
|
||||
level(lvl),
|
||||
epoch_number(_epoch_number) {}
|
||||
epoch_number(_epoch_number),
|
||||
file_missing(_file_missing),
|
||||
oldest_blob_file_number(_oldest_blob_file_number) {}
|
||||
};
|
||||
|
||||
// Create dummy sst, return their metadata. Note that only file name and size
|
||||
|
@ -1395,22 +1405,32 @@ class VersionSetTestBase {
|
|||
ASSERT_NE(0, file_size);
|
||||
file_metas->emplace_back(
|
||||
file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false,
|
||||
Temperature::kUnknown, 0, 0, 0, info.epoch_number,
|
||||
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2,
|
||||
0, 0, /* user_defined_timestamps_persisted */ true);
|
||||
Temperature::kUnknown, info.oldest_blob_file_number, 0, 0,
|
||||
info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
|
||||
kNullUniqueId64x2, 0, 0,
|
||||
/* user_defined_timestamps_persisted */ true);
|
||||
if (info.file_missing) {
|
||||
ASSERT_OK(fs_->DeleteFile(fname, IOOptions(), nullptr));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CreateCurrentFile() {
|
||||
// Make "CURRENT" file point to the new manifest file.
|
||||
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
Temperature::kUnknown,
|
||||
/* dir_contains_current_file */ nullptr));
|
||||
}
|
||||
|
||||
// Create DB with 3 column families.
|
||||
void NewDB() {
|
||||
SequenceNumber last_seqno;
|
||||
std::unique_ptr<log::Writer> log_writer;
|
||||
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
|
||||
ASSERT_OK(
|
||||
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
|
||||
PrepareManifest(&column_families_, &last_seqno, &log_writer);
|
||||
log_writer.reset();
|
||||
// Make "CURRENT" file point to the new manifest file.
|
||||
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
|
||||
EXPECT_OK(versions_->Recover(column_families_, false));
|
||||
EXPECT_EQ(column_families_.size(),
|
||||
|
@ -2586,7 +2606,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
|
|||
edits_[i].MarkAtomicGroup(--remaining);
|
||||
edits_[i].SetLastSequence(last_seqno_++);
|
||||
}
|
||||
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr));
|
||||
CreateCurrentFile();
|
||||
}
|
||||
|
||||
void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
|
||||
|
@ -2598,7 +2618,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
|
|||
edits_[i].MarkAtomicGroup(--remaining);
|
||||
edits_[i].SetLastSequence(last_seqno_++);
|
||||
}
|
||||
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr));
|
||||
CreateCurrentFile();
|
||||
}
|
||||
|
||||
void SetupCorruptedAtomicGroup(int atomic_group_size) {
|
||||
|
@ -2612,7 +2632,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
|
|||
}
|
||||
edits_[i].SetLastSequence(last_seqno_++);
|
||||
}
|
||||
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr));
|
||||
CreateCurrentFile();
|
||||
}
|
||||
|
||||
void SetupIncorrectAtomicGroup(int atomic_group_size) {
|
||||
|
@ -2628,7 +2648,7 @@ class VersionSetAtomicGroupTest : public VersionSetTestBase,
|
|||
}
|
||||
edits_[i].SetLastSequence(last_seqno_++);
|
||||
}
|
||||
ASSERT_OK(SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr));
|
||||
CreateCurrentFile();
|
||||
}
|
||||
|
||||
void SetupTestSyncPoints() {
|
||||
|
@ -3394,8 +3414,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
|
|||
SequenceNumber last_seqno;
|
||||
std::unique_ptr<log::Writer> log_writer;
|
||||
PrepareManifest(&column_families, &last_seqno, &log_writer);
|
||||
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
|
||||
EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
|
||||
EXPECT_EQ(column_families.size(),
|
||||
|
@ -3417,7 +3436,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
|
|||
cfd_to_drop->Ref();
|
||||
drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
|
||||
mutex_.Lock();
|
||||
s = versions_->LogAndApply(
|
||||
Status s = versions_->LogAndApply(
|
||||
cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options,
|
||||
write_options, &drop_cf_edit, &mutex_, nullptr);
|
||||
mutex_.Unlock();
|
||||
|
@ -3527,9 +3546,7 @@ class EmptyDefaultCfNewManifest : public VersionSetTestBase,
|
|||
TEST_F(EmptyDefaultCfNewManifest, Recover) {
|
||||
PrepareManifest(nullptr, nullptr, &log_writer_);
|
||||
log_writer_.reset();
|
||||
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
/* dir_contains_current_file */ nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
std::vector<ColumnFamilyDescriptor> column_families;
|
||||
|
@ -3538,7 +3555,7 @@ TEST_F(EmptyDefaultCfNewManifest, Recover) {
|
|||
cf_options_);
|
||||
std::string db_id;
|
||||
bool has_missing_table_file = false;
|
||||
s = versions_->TryRecoverFromOneManifest(
|
||||
Status s = versions_->TryRecoverFromOneManifest(
|
||||
manifest_path, column_families, false, &db_id, &has_missing_table_file);
|
||||
ASSERT_OK(s);
|
||||
ASSERT_FALSE(has_missing_table_file);
|
||||
|
@ -3559,7 +3576,8 @@ class VersionSetTestEmptyDb
|
|||
assert(nullptr != log_writer);
|
||||
VersionEdit new_db;
|
||||
if (db_options_.write_dbid_to_manifest) {
|
||||
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_));
|
||||
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_,
|
||||
Temperature::kUnknown));
|
||||
DBOptions tmp_db_options;
|
||||
tmp_db_options.env = env_;
|
||||
std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
|
||||
|
@ -3592,9 +3610,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
|
|||
db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
|
||||
PrepareManifest(nullptr, nullptr, &log_writer_);
|
||||
log_writer_.reset();
|
||||
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
/* dir_contains_current_file */ nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
|
@ -3609,9 +3625,9 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
|
|||
|
||||
std::string db_id;
|
||||
bool has_missing_table_file = false;
|
||||
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
|
||||
read_only, &db_id,
|
||||
&has_missing_table_file);
|
||||
Status s = versions_->TryRecoverFromOneManifest(
|
||||
manifest_path, column_families, read_only, &db_id,
|
||||
&has_missing_table_file);
|
||||
auto iter =
|
||||
std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
|
||||
if (iter == cf_names.end()) {
|
||||
|
@ -3637,9 +3653,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
|
|||
ASSERT_OK(s);
|
||||
}
|
||||
log_writer_.reset();
|
||||
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
/* dir_contains_current_file */ nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
|
@ -3685,9 +3699,7 @@ TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
|
|||
ASSERT_OK(s);
|
||||
}
|
||||
log_writer_.reset();
|
||||
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
/* dir_contains_current_file */ nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
|
@ -3744,9 +3756,7 @@ TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
|
|||
ASSERT_OK(s);
|
||||
}
|
||||
log_writer_.reset();
|
||||
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
/* dir_contains_current_file */ nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
|
@ -3802,9 +3812,7 @@ TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
|
|||
ASSERT_OK(s);
|
||||
}
|
||||
log_writer_.reset();
|
||||
s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
|
||||
/* dir_contains_current_file */ nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
|
@ -3869,8 +3877,9 @@ INSTANTIATE_TEST_CASE_P(
|
|||
class VersionSetTestMissingFiles : public VersionSetTestBase,
|
||||
public testing::Test {
|
||||
public:
|
||||
VersionSetTestMissingFiles()
|
||||
: VersionSetTestBase("version_set_test_missing_files"),
|
||||
explicit VersionSetTestMissingFiles(
|
||||
const std::string& test_name = "version_set_test_missing_files")
|
||||
: VersionSetTestBase(test_name),
|
||||
internal_comparator_(
|
||||
std::make_shared<InternalKeyComparator>(options_.comparator)) {}
|
||||
|
||||
|
@ -3947,7 +3956,8 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
|
|||
// This method updates last_sequence_.
|
||||
void WriteFileAdditionAndDeletionToManifest(
|
||||
uint32_t cf, const std::vector<std::pair<int, FileMetaData>>& added_files,
|
||||
const std::vector<std::pair<int, uint64_t>>& deleted_files) {
|
||||
const std::vector<std::pair<int, uint64_t>>& deleted_files,
|
||||
const std::vector<BlobFileAddition>& blob_files = {}) {
|
||||
VersionEdit edit;
|
||||
edit.SetColumnFamily(cf);
|
||||
for (const auto& elem : added_files) {
|
||||
|
@ -3958,6 +3968,9 @@ class VersionSetTestMissingFiles : public VersionSetTestBase,
|
|||
int level = elem.first;
|
||||
edit.DeleteFile(level, elem.second);
|
||||
}
|
||||
for (const auto& elem : blob_files) {
|
||||
edit.AddBlobFile(elem);
|
||||
}
|
||||
edit.SetLastSequence(last_seqno_);
|
||||
++last_seqno_;
|
||||
assert(log_writer_.get() != nullptr);
|
||||
|
@ -4006,15 +4019,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
|
|||
WriteFileAdditionAndDeletionToManifest(
|
||||
/*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
|
||||
log_writer_.reset();
|
||||
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
std::string db_id;
|
||||
bool has_missing_table_file = false;
|
||||
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
|
||||
/*read_only=*/false, &db_id,
|
||||
&has_missing_table_file);
|
||||
Status s = versions_->TryRecoverFromOneManifest(
|
||||
manifest_path, column_families_,
|
||||
/*read_only=*/false, &db_id, &has_missing_table_file);
|
||||
ASSERT_OK(s);
|
||||
ASSERT_TRUE(has_missing_table_file);
|
||||
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
|
||||
|
@ -4064,15 +4076,14 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
|
|||
WriteFileAdditionAndDeletionToManifest(
|
||||
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
|
||||
log_writer_.reset();
|
||||
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
std::string db_id;
|
||||
bool has_missing_table_file = false;
|
||||
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
|
||||
/*read_only=*/false, &db_id,
|
||||
&has_missing_table_file);
|
||||
Status s = versions_->TryRecoverFromOneManifest(
|
||||
manifest_path, column_families_,
|
||||
/*read_only=*/false, &db_id, &has_missing_table_file);
|
||||
ASSERT_OK(s);
|
||||
ASSERT_TRUE(has_missing_table_file);
|
||||
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
|
||||
|
@ -4118,15 +4129,14 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
|
|||
WriteFileAdditionAndDeletionToManifest(
|
||||
/*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
|
||||
log_writer_.reset();
|
||||
Status s = SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1, nullptr);
|
||||
ASSERT_OK(s);
|
||||
CreateCurrentFile();
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
std::string db_id;
|
||||
bool has_missing_table_file = false;
|
||||
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
|
||||
/*read_only=*/false, &db_id,
|
||||
&has_missing_table_file);
|
||||
Status s = versions_->TryRecoverFromOneManifest(
|
||||
manifest_path, column_families_,
|
||||
/*read_only=*/false, &db_id, &has_missing_table_file);
|
||||
ASSERT_OK(s);
|
||||
ASSERT_FALSE(has_missing_table_file);
|
||||
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
|
||||
|
@ -4171,6 +4181,250 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
|
|||
}
|
||||
}
|
||||
|
||||
class BestEffortsRecoverIncompleteVersionTest
|
||||
: public VersionSetTestMissingFiles {
|
||||
public:
|
||||
BestEffortsRecoverIncompleteVersionTest()
|
||||
: VersionSetTestMissingFiles("best_efforts_recover_incomplete_version") {}
|
||||
|
||||
struct BlobInfo {
|
||||
uint64_t file_number;
|
||||
bool file_missing;
|
||||
std::string key;
|
||||
std::string blob;
|
||||
BlobInfo(uint64_t _file_number, bool _file_missing, std::string _key,
|
||||
std::string _blob)
|
||||
: file_number(_file_number),
|
||||
file_missing(_file_missing),
|
||||
key(_key),
|
||||
blob(_blob) {}
|
||||
};
|
||||
|
||||
void CreateDummyBlobFiles(const std::vector<BlobInfo>& infos,
|
||||
std::vector<BlobFileAddition>* blob_metas) {
|
||||
for (const auto& info : infos) {
|
||||
if (!info.file_missing) {
|
||||
WriteDummyBlobFile(info.file_number, info.key, info.blob);
|
||||
}
|
||||
blob_metas->emplace_back(
|
||||
info.file_number, 1 /*total_blob_count*/,
|
||||
info.key.size() + info.blob.size() /*total_blob_bytes*/,
|
||||
"" /*checksum_method*/, "" /*check_sum_value*/);
|
||||
}
|
||||
}
|
||||
// Creates a test blob file that is valid so it can pass the
|
||||
// `VersionEditHandlerPointInTime::VerifyBlobFile` check.
|
||||
void WriteDummyBlobFile(uint64_t blob_file_number, const Slice& key,
|
||||
const Slice& blob) {
|
||||
ImmutableOptions options;
|
||||
std::string blob_file_path = BlobFileName(dbname_, blob_file_number);
|
||||
|
||||
std::unique_ptr<FSWritableFile> file;
|
||||
ASSERT_OK(
|
||||
fs_->NewWritableFile(blob_file_path, FileOptions(), &file, nullptr));
|
||||
|
||||
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
||||
std::move(file), blob_file_path, FileOptions(), options.clock));
|
||||
|
||||
BlobLogWriter blob_log_writer(std::move(file_writer), options.clock,
|
||||
/*statistics*/ nullptr, blob_file_number,
|
||||
/*use_fsync*/ true,
|
||||
/*do_flush*/ false);
|
||||
|
||||
constexpr ExpirationRange expiration_range;
|
||||
BlobLogHeader header(/*column_family_id*/ 0, kNoCompression,
|
||||
/*has_ttl*/ false, expiration_range);
|
||||
ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));
|
||||
std::string compressed_blob;
|
||||
uint64_t key_offset = 0;
|
||||
uint64_t blob_offset = 0;
|
||||
ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), key, blob, &key_offset,
|
||||
&blob_offset));
|
||||
BlobLogFooter footer;
|
||||
footer.blob_count = 1;
|
||||
footer.expiration_range = expiration_range;
|
||||
std::string checksum_method;
|
||||
std::string checksum_value;
|
||||
ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer,
|
||||
&checksum_method, &checksum_value));
|
||||
}
|
||||
|
||||
void RecoverFromManifestWithMissingFiles(
|
||||
const std::vector<std::pair<int, FileMetaData>>& added_files,
|
||||
const std::vector<BlobFileAddition>& blob_files) {
|
||||
PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
|
||||
WriteFileAdditionAndDeletionToManifest(
|
||||
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>(),
|
||||
blob_files);
|
||||
log_writer_.reset();
|
||||
CreateCurrentFile();
|
||||
std::string manifest_path;
|
||||
VerifyManifest(&manifest_path);
|
||||
std::string db_id;
|
||||
bool has_missing_table_file = false;
|
||||
Status s = versions_->TryRecoverFromOneManifest(
|
||||
manifest_path, column_families_,
|
||||
/*read_only=*/false, &db_id, &has_missing_table_file);
|
||||
ASSERT_OK(s);
|
||||
ASSERT_TRUE(has_missing_table_file);
|
||||
}
|
||||
};
|
||||
|
||||
// The only L1 file (100) is absent on disk while both L0 files (101, 102)
// exist. After recovery the version set is expected to report no live table
// files.
TEST_F(BestEffortsRecoverIncompleteVersionTest, NonL0MissingFiles) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", /*level=*/1,
              /*epoch_number=*/100, /*file_missing=*/true),
      SstInfo(101, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/101, /*file_missing=*/false),
      SstInfo(102, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/102, /*file_missing=*/false),
  };
  std::vector<FileMetaData> metas;
  CreateDummyTableFiles(ssts, &metas);

  // Pair every file's metadata with the level it is added to.
  std::vector<std::pair<int, FileMetaData>> additions;
  for (size_t idx = 0; idx < ssts.size(); ++idx) {
    additions.emplace_back(ssts[idx].level, metas[idx]);
  }

  const std::vector<BlobFileAddition> no_blob_files;
  RecoverFromManifestWithMissingFiles(additions, no_blob_files);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_TRUE(live_tables.empty());
}
|
||||
|
||||
// Of the two L0 files, the one with the smaller epoch number (101) is absent
// while the one with the larger epoch number (102) and the L1 file (100)
// exist. After recovery the version set is expected to report no live table
// files.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingNonSuffixL0Files) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", /*level=*/1,
              /*epoch_number=*/100, /*file_missing=*/false),
      SstInfo(101, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/101, /*file_missing=*/true),
      SstInfo(102, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/102, /*file_missing=*/false),
  };
  std::vector<FileMetaData> metas;
  CreateDummyTableFiles(ssts, &metas);

  // Pair every file's metadata with the level it is added to.
  std::vector<std::pair<int, FileMetaData>> additions;
  for (size_t idx = 0; idx < ssts.size(); ++idx) {
    additions.emplace_back(ssts[idx].level, metas[idx]);
  }

  const std::vector<BlobFileAddition> no_blob_files;
  RecoverFromManifestWithMissingFiles(additions, no_blob_files);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_TRUE(live_tables.empty());
}
|
||||
|
||||
// Both L0 table files (100, 101) exist, but the blob files they name as
// their oldest blob file (102, 103) are absent. After recovery the version
// set is expected to report no live table files.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingBlobFiles) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/100, /*file_missing=*/false,
              /*oldest_blob_file_number=*/102),
      SstInfo(101, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/101, /*file_missing=*/false,
              /*oldest_blob_file_number=*/103),
  };
  std::vector<FileMetaData> metas;
  CreateDummyTableFiles(ssts, &metas);

  std::vector<BlobInfo> blobs = {
      BlobInfo(102, /*file_missing=*/true, "a", "blob1"),
      BlobInfo(103, /*file_missing=*/true, "a", "blob2"),
  };
  std::vector<BlobFileAddition> blob_additions;
  CreateDummyBlobFiles(blobs, &blob_additions);

  // Pair every file's metadata with the level it is added to.
  std::vector<std::pair<int, FileMetaData>> additions;
  for (size_t idx = 0; idx < ssts.size(); ++idx) {
    additions.emplace_back(ssts[idx].level, metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(additions, blob_additions);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_TRUE(live_tables.empty());
}
|
||||
|
||||
// Only the L0 file with the largest epoch number (102) is absent; the other
// L0 file (101) and the L1 file (100) exist. After recovery two table files
// are expected to be live: one in L0 and one in L1.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingL0SuffixOnly) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", /*level=*/1,
              /*epoch_number=*/100, /*file_missing=*/false),
      SstInfo(101, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/101, /*file_missing=*/false),
      SstInfo(102, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/102, /*file_missing=*/true),
  };
  std::vector<FileMetaData> metas;
  CreateDummyTableFiles(ssts, &metas);

  // Pair every file's metadata with the level it is added to.
  std::vector<std::pair<int, FileMetaData>> additions;
  for (size_t idx = 0; idx < ssts.size(); ++idx) {
    additions.emplace_back(ssts[idx].level, metas[idx]);
  }

  const std::vector<BlobFileAddition> no_blob_files;
  RecoverFromManifestWithMissingFiles(additions, no_blob_files);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_EQ(2, live_tables.size());

  // Inspect the per-level layout of the default column family's current
  // version.
  VersionStorageInfo* storage = versions_->GetColumnFamilySet()
                                    ->GetDefault()
                                    ->current()
                                    ->storage_info();
  ASSERT_EQ(1, storage->LevelFiles(0).size());
  ASSERT_EQ(1, storage->LevelFiles(1).size());
}
|
||||
|
||||
// The L0 file with the largest epoch number (102) and the blob file it names
// as its oldest blob file (104) are both absent; L0 file 101, its blob file
// 103 and the L1 file 100 exist. After recovery two table files (one per
// level) and one blob file are expected to be live.
TEST_F(BestEffortsRecoverIncompleteVersionTest,
       MissingL0SuffixAndTheirBlobFiles) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", /*level=*/1,
              /*epoch_number=*/100, /*file_missing=*/false),
      SstInfo(101, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/101, /*file_missing=*/false,
              /*oldest_blob_file_number=*/103),
      SstInfo(102, kDefaultColumnFamilyName, "a", /*level=*/0,
              /*epoch_number=*/102, /*file_missing=*/true,
              /*oldest_blob_file_number=*/104),
  };
  std::vector<FileMetaData> metas;
  CreateDummyTableFiles(ssts, &metas);

  std::vector<BlobInfo> blobs = {
      BlobInfo(103, /*file_missing=*/false, "a", "blob1"),
      BlobInfo(104, /*file_missing=*/true, "a", "blob2"),
  };
  std::vector<BlobFileAddition> blob_additions;
  CreateDummyBlobFiles(blobs, &blob_additions);

  // Pair every file's metadata with the level it is added to.
  std::vector<std::pair<int, FileMetaData>> additions;
  for (size_t idx = 0; idx < ssts.size(); ++idx) {
    additions.emplace_back(ssts[idx].level, metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(additions, blob_additions);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_EQ(2, live_tables.size());
  ASSERT_EQ(1, live_blobs.size());

  // Inspect the per-level layout of the default column family's current
  // version.
  VersionStorageInfo* storage = versions_->GetColumnFamilySet()
                                    ->GetDefault()
                                    ->current()
                                    ->storage_info();
  ASSERT_EQ(1, storage->LevelFiles(0).size());
  ASSERT_EQ(1, storage->LevelFiles(1).size());
  ASSERT_EQ(1, storage->GetBlobFiles().size());
}
|
||||
|
||||
class ChargeFileMetadataTest : public DBTestBase {
|
||||
public:
|
||||
ChargeFileMetadataTest()
|
||||
|
|
|
@ -929,15 +929,19 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Put(this, cf_id, key, value);
|
||||
s = WriteBatchInternal::Put(this, cf_id, key, value);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
|
@ -962,7 +966,7 @@ Status WriteBatch::TimedPut(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
|
||||
Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& ts, const Slice& value) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -970,8 +974,12 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
assert(column_family);
|
||||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{key, ts}};
|
||||
return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
s = WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
|
||||
|
@ -1039,7 +1047,11 @@ Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
|
|||
}
|
||||
|
||||
if (ts_sz == 0) {
|
||||
return WriteBatchInternal::Put(this, cf_id, key, value);
|
||||
s = WriteBatchInternal::Put(this, cf_id, key, value);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
@ -1246,20 +1258,24 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Delete(this, cf_id, key);
|
||||
s = WriteBatchInternal::Delete(this, cf_id, key);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
s = WriteBatchInternal::Delete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
return WriteBatchInternal::Delete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& ts) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -1267,8 +1283,12 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
has_key_with_ts_ = true;
|
||||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{key, ts}};
|
||||
return WriteBatchInternal::Delete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
s = WriteBatchInternal::Delete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
|
||||
|
@ -1313,7 +1333,11 @@ Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Delete(this, cf_id, key);
|
||||
s = WriteBatchInternal::Delete(this, cf_id, key);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
@ -1361,20 +1385,24 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::SingleDelete(this, cf_id, key);
|
||||
s = WriteBatchInternal::SingleDelete(this, cf_id, key);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
s = WriteBatchInternal::SingleDelete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
return WriteBatchInternal::SingleDelete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
||||
const Slice& key, const Slice& ts) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -1382,8 +1410,12 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|||
assert(column_family);
|
||||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{key, ts}};
|
||||
return WriteBatchInternal::SingleDelete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
s = WriteBatchInternal::SingleDelete(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::SingleDelete(WriteBatch* b,
|
||||
|
@ -1430,7 +1462,11 @@ Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::SingleDelete(this, cf_id, key);
|
||||
s = WriteBatchInternal::SingleDelete(this, cf_id, key);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
@ -1480,23 +1516,27 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
||||
s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
|
||||
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
|
||||
s = WriteBatchInternal::DeleteRange(this, cf_id,
|
||||
SliceParts(begin_key_with_ts.data(), 2),
|
||||
SliceParts(end_key_with_ts.data(), 2));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
|
||||
std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
|
||||
return WriteBatchInternal::DeleteRange(
|
||||
this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
|
||||
SliceParts(end_key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
||||
const Slice& begin_key, const Slice& end_key,
|
||||
const Slice& ts) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -1505,9 +1545,13 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{begin_key, ts}};
|
||||
std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
|
||||
return WriteBatchInternal::DeleteRange(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(end_key_with_ts.data(), 2));
|
||||
s = WriteBatchInternal::DeleteRange(this, cf_id,
|
||||
SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(end_key_with_ts.data(), 2));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
|
||||
|
@ -1554,7 +1598,11 @@ Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
||||
s = WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
@ -1608,21 +1656,25 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Merge(this, cf_id, key, value);
|
||||
s = WriteBatchInternal::Merge(this, cf_id, key, value);
|
||||
} else {
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
|
||||
s = WriteBatchInternal::Merge(
|
||||
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
|
||||
}
|
||||
|
||||
needs_in_place_update_ts_ = true;
|
||||
has_key_with_ts_ = true;
|
||||
std::string dummy_ts(ts_sz, '\0');
|
||||
std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
|
||||
|
||||
return WriteBatchInternal::Merge(
|
||||
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
||||
const Slice& ts, const Slice& value) {
|
||||
const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
Status s = CheckColumnFamilyTimestampSize(column_family, ts);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
@ -1630,8 +1682,12 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
|
|||
assert(column_family);
|
||||
uint32_t cf_id = column_family->GetID();
|
||||
std::array<Slice, 2> key_with_ts{{key, ts}};
|
||||
return WriteBatchInternal::Merge(
|
||||
this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
|
||||
s = WriteBatchInternal::Merge(this, cf_id, SliceParts(key_with_ts.data(), 2),
|
||||
SliceParts(&value, 1));
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts.size());
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
|
||||
|
@ -1680,7 +1736,11 @@ Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
|
|||
}
|
||||
|
||||
if (0 == ts_sz) {
|
||||
return WriteBatchInternal::Merge(this, cf_id, key, value);
|
||||
s = WriteBatchInternal::Merge(this, cf_id, key, value);
|
||||
if (s.ok()) {
|
||||
MaybeTrackTimestampSize(cf_id, ts_sz);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
|
|
|
@ -125,7 +125,6 @@ DECLARE_int32(level0_stop_writes_trigger);
|
|||
DECLARE_int32(block_size);
|
||||
DECLARE_int32(format_version);
|
||||
DECLARE_int32(index_block_restart_interval);
|
||||
DECLARE_bool(disable_auto_compactions);
|
||||
DECLARE_int32(max_background_compactions);
|
||||
DECLARE_int32(num_bottom_pri_threads);
|
||||
DECLARE_int32(compaction_thread_pool_adjust_interval);
|
||||
|
@ -151,6 +150,7 @@ DECLARE_bool(charge_filter_construction);
|
|||
DECLARE_bool(charge_table_reader);
|
||||
DECLARE_bool(charge_file_metadata);
|
||||
DECLARE_bool(charge_blob_cache);
|
||||
DECLARE_bool(decouple_partitioned_filters);
|
||||
DECLARE_int32(top_level_index_pinning);
|
||||
DECLARE_int32(partition_pinning);
|
||||
DECLARE_int32(unpartitioned_pinning);
|
||||
|
@ -274,6 +274,7 @@ DECLARE_bool(verification_only);
|
|||
DECLARE_string(last_level_temperature);
|
||||
DECLARE_string(default_write_temperature);
|
||||
DECLARE_string(default_temperature);
|
||||
DECLARE_bool(paranoid_memory_checks);
|
||||
|
||||
// Options for transaction dbs.
|
||||
// Use TransactionDB (a.k.a. Pessimistic Transaction DB)
|
||||
|
@ -318,7 +319,6 @@ DECLARE_int32(prepopulate_blob_cache);
|
|||
DECLARE_int32(approximate_size_one_in);
|
||||
DECLARE_bool(best_efforts_recovery);
|
||||
DECLARE_bool(skip_verifydb);
|
||||
DECLARE_bool(enable_compaction_filter);
|
||||
DECLARE_bool(paranoid_file_checks);
|
||||
DECLARE_bool(fail_if_options_file_error);
|
||||
DECLARE_uint64(batch_protection_bytes_per_key);
|
||||
|
|
|
@ -49,7 +49,7 @@ class DbStressCompactionFilter : public CompactionFilter {
|
|||
return Decision::kKeep;
|
||||
}
|
||||
// Reaching here means we acquired the lock.
|
||||
|
||||
key_mutex->AssertHeld();
|
||||
bool key_exists = state_->Exists(cf_id_, key_num);
|
||||
const bool allow_overwrite = state_->AllowsOverwrite(key_num);
|
||||
|
||||
|
|
|
@ -167,7 +167,10 @@ bool RunStressTestImpl(SharedState* shared) {
|
|||
{FileType::kWalFile});
|
||||
}
|
||||
}
|
||||
now = clock->NowMicros();
|
||||
if (ShouldDisableAutoCompactionsBeforeVerifyDb()) {
|
||||
Status s = stress->EnableAutoCompaction();
|
||||
assert(s.ok());
|
||||
}
|
||||
fprintf(stdout, "%s Starting database operations\n",
|
||||
clock->TimeToString(now / 1000000).c_str());
|
||||
|
||||
|
|
|
@ -380,6 +380,11 @@ DEFINE_bool(charge_blob_cache, false,
|
|||
"CacheEntryRoleOptions::charged of "
|
||||
"kBlobCache");
|
||||
|
||||
DEFINE_bool(
|
||||
decouple_partitioned_filters,
|
||||
ROCKSDB_NAMESPACE::BlockBasedTableOptions().decouple_partitioned_filters,
|
||||
"Decouple filter partitioning from index partitioning.");
|
||||
|
||||
DEFINE_int32(
|
||||
top_level_index_pinning,
|
||||
static_cast<int32_t>(ROCKSDB_NAMESPACE::PinningTier::kFallback),
|
||||
|
@ -1443,4 +1448,8 @@ DEFINE_uint32(uncache_aggressiveness,
|
|||
"obsolete. 0 = disabled, 1 = minimum, 100 = moderate, 10000 = "
|
||||
"normal max");
|
||||
|
||||
DEFINE_bool(paranoid_memory_checks,
|
||||
ROCKSDB_NAMESPACE::Options().paranoid_memory_checks,
|
||||
"Sets CF option paranoid_memory_checks.");
|
||||
|
||||
#endif // GFLAGS
|
||||
|
|
|
@ -45,6 +45,8 @@ DECLARE_int32(open_write_fault_one_in);
|
|||
DECLARE_int32(open_read_fault_one_in);
|
||||
|
||||
DECLARE_int32(inject_error_severity);
|
||||
DECLARE_bool(disable_auto_compactions);
|
||||
DECLARE_bool(enable_compaction_filter);
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
class StressTest;
|
||||
|
@ -262,14 +264,10 @@ class SharedState {
|
|||
// This is useful for crash-recovery testing when the process may crash
|
||||
// before updating the corresponding expected value
|
||||
//
|
||||
// It can fail and `*prepared` will be set to false if the previous write or
|
||||
// delete is still in pending state (e.g, still in recovery for retryable IO
|
||||
// errors). If succeeds,`*prepared` will be set to true
|
||||
//
|
||||
// Requires external locking covering `key` in `cf` to prevent
|
||||
// concurrent write or delete to the same `key`.
|
||||
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared) {
|
||||
return expected_state_manager_->PreparePut(cf, key, prepared);
|
||||
PendingExpectedValue PreparePut(int cf, int64_t key) {
|
||||
return expected_state_manager_->PreparePut(cf, key);
|
||||
}
|
||||
|
||||
// Does not require external locking.
|
||||
|
@ -281,31 +279,24 @@ class SharedState {
|
|||
// This is useful for crash-recovery testing when the process may crash
|
||||
// before updating the corresponding expected value
|
||||
//
|
||||
// It can fail and `*prepared` will be set to false if the previous write or
|
||||
// delete is still in pending state (e.g, still in recovery for retryable IO
|
||||
// errors). If succeeds,`*prepared` will be set to true
|
||||
//
|
||||
// Requires external locking covering `key` in `cf` to prevent concurrent
|
||||
// write or delete to the same `key`.
|
||||
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared) {
|
||||
return expected_state_manager_->PrepareDelete(cf, key, prepared);
|
||||
PendingExpectedValue PrepareDelete(int cf, int64_t key) {
|
||||
return expected_state_manager_->PrepareDelete(cf, key);
|
||||
}
|
||||
|
||||
// Requires external locking covering `key` in `cf` to prevent concurrent
|
||||
// write or delete to the same `key`.
|
||||
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key,
|
||||
bool* prepared) {
|
||||
return expected_state_manager_->PrepareSingleDelete(cf, key, prepared);
|
||||
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
|
||||
return expected_state_manager_->PrepareSingleDelete(cf, key);
|
||||
}
|
||||
|
||||
// Requires external locking covering keys in `[begin_key, end_key)` in `cf`
|
||||
// to prevent concurrent write or delete to the same `key`.
|
||||
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
|
||||
int64_t begin_key,
|
||||
int64_t end_key,
|
||||
bool* prepared) {
|
||||
return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key,
|
||||
prepared);
|
||||
int64_t end_key) {
|
||||
return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key);
|
||||
}
|
||||
|
||||
bool AllowsOverwrite(int64_t key) const {
|
||||
|
|
|
@ -632,10 +632,8 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys,
|
|||
for (auto cfh : column_families_) {
|
||||
for (int64_t k = 0; k != number_of_keys; ++k) {
|
||||
const std::string key = Key(k);
|
||||
bool prepare = false;
|
||||
PendingExpectedValue pending_expected_value =
|
||||
shared->PreparePut(cf_idx, k, &prepare);
|
||||
assert(prepare);
|
||||
shared->PreparePut(cf_idx, k);
|
||||
const uint32_t value_base = pending_expected_value.GetFinalValueBase();
|
||||
const size_t sz = GenerateValue(value_base, value, sizeof(value));
|
||||
|
||||
|
@ -3676,7 +3674,7 @@ void StressTest::Reopen(ThreadState* thread) {
|
|||
// crash-recovery verification does. Therefore it always expects no data loss
|
||||
// and we should ensure no data loss in testing.
|
||||
// TODO(hx235): eliminate the FlushWAL(true /* sync */)/SyncWAL() below
|
||||
if (!FLAGS_disable_wal && !FLAGS_avoid_flush_during_shutdown) {
|
||||
if (!FLAGS_disable_wal && FLAGS_avoid_flush_during_shutdown) {
|
||||
Status s;
|
||||
if (FLAGS_manual_wal_flush_one_in > 0) {
|
||||
s = db_->FlushWAL(/*sync=*/true);
|
||||
|
@ -3834,6 +3832,10 @@ void CheckAndSetOptionsForUserTimestamp(Options& options) {
|
|||
FLAGS_persist_user_defined_timestamps;
|
||||
}
|
||||
|
||||
bool ShouldDisableAutoCompactionsBeforeVerifyDb() {
|
||||
return !FLAGS_disable_auto_compactions && FLAGS_enable_compaction_filter;
|
||||
}
|
||||
|
||||
bool InitializeOptionsFromFile(Options& options) {
|
||||
DBOptions db_options;
|
||||
ConfigOptions config_options;
|
||||
|
@ -3861,6 +3863,8 @@ void InitializeOptionsFromFlags(
|
|||
const std::shared_ptr<const FilterPolicy>& filter_policy,
|
||||
Options& options) {
|
||||
BlockBasedTableOptions block_based_options;
|
||||
block_based_options.decouple_partitioned_filters =
|
||||
FLAGS_decouple_partitioned_filters;
|
||||
block_based_options.block_cache = cache;
|
||||
block_based_options.cache_index_and_filter_blocks =
|
||||
FLAGS_cache_index_and_filter_blocks;
|
||||
|
@ -3947,7 +3951,11 @@ void InitializeOptionsFromFlags(
|
|||
new WriteBufferManager(FLAGS_db_write_buffer_size, block_cache));
|
||||
}
|
||||
options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering;
|
||||
options.disable_auto_compactions = FLAGS_disable_auto_compactions;
|
||||
if (ShouldDisableAutoCompactionsBeforeVerifyDb()) {
|
||||
options.disable_auto_compactions = true;
|
||||
} else {
|
||||
options.disable_auto_compactions = FLAGS_disable_auto_compactions;
|
||||
}
|
||||
options.max_background_compactions = FLAGS_max_background_compactions;
|
||||
options.max_background_flushes = FLAGS_max_background_flushes;
|
||||
options.compaction_style =
|
||||
|
@ -4047,6 +4055,7 @@ void InitializeOptionsFromFlags(
|
|||
options.memtable_protection_bytes_per_key =
|
||||
FLAGS_memtable_protection_bytes_per_key;
|
||||
options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key;
|
||||
options.paranoid_memory_checks = FLAGS_paranoid_memory_checks;
|
||||
|
||||
// Integrated BlobDB
|
||||
options.enable_blob_files = FLAGS_enable_blob_files;
|
||||
|
@ -4262,6 +4271,7 @@ void InitializeOptionsGeneral(
|
|||
options.disable_auto_compactions = true;
|
||||
}
|
||||
|
||||
options.table_properties_collector_factories.clear();
|
||||
options.table_properties_collector_factories.emplace_back(
|
||||
std::make_shared<DbStressTablePropertiesCollectorFactory>());
|
||||
|
||||
|
|
|
@ -48,7 +48,11 @@ class StressTest {
|
|||
return FLAGS_sync_fault_injection || FLAGS_disable_wal ||
|
||||
FLAGS_manual_wal_flush_one_in > 0;
|
||||
}
|
||||
|
||||
// Re-enables auto compaction on every handle in `column_families_` and
// returns the resulting status. Asserts that `options_` currently has
// `disable_auto_compactions` set.
Status EnableAutoCompaction() {
  assert(options_.disable_auto_compactions);
  return db_->EnableAutoCompaction(column_families_);
}
|
||||
void CleanUp();
|
||||
|
||||
protected:
|
||||
|
@ -64,6 +68,42 @@ class StressTest {
|
|||
}
|
||||
}
|
||||
|
||||
// Captures the outcome of the first failed write attempt of an operation.
//
// When `write_s` is a failure and `*initial_write_s` is still OK, this:
//   - copies `write_s` into `*initial_write_s`,
//   - sets `*initial_wal_write_may_succeed` to false iff the error is
//     classified by FaultInjectionTestFS::IsFailedToWriteToWALError(),
//   - stamps `*wait_for_recover_start_time` with `db_stress_env->NowMicros()`.
// Otherwise all output parameters are left untouched. No pointer argument may
// be null.
void UpdateIfInitialWriteFails(Env* db_stress_env, const Status& write_s,
                               Status* initial_write_s,
                               bool* initial_wal_write_may_succeed,
                               uint64_t* wait_for_recover_start_time) {
  assert(db_stress_env && initial_write_s && initial_wal_write_may_succeed &&
         wait_for_recover_start_time);
  // Only the first failure is recorded; successes and later failures are
  // ignored.
  const bool is_first_failure = !write_s.ok() && initial_write_s->ok();
  if (!is_first_failure) {
    return;
  }
  *initial_write_s = write_s;
  const bool wal_write_failed =
      FaultInjectionTestFS::IsFailedToWriteToWALError(*initial_write_s);
  *initial_wal_write_may_succeed = !wal_write_failed;
  *wait_for_recover_start_time = db_stress_env->NowMicros();
}
|
||||
|
||||
// Logs how long a thread spent waiting for write recovery, if it waited.
//
// A wait is assumed to have happened when `initial_write_s` is a failure that
// IsErrorInjectedAndRetryable() accepts and `initial_wal_write_may_succeed`
// is true. The elapsed time is measured from `wait_for_recover_start_time`
// (microseconds) to `db_stress_env->NowMicros()`, and a line tagged with
// `thread_name` is printed to stdout only when it exceeds 10 whole seconds.
void PrintWriteRecoveryWaitTimeIfNeeded(Env* db_stress_env,
                                        const Status& initial_write_s,
                                        bool initial_wal_write_may_succeed,
                                        uint64_t wait_for_recover_start_time,
                                        const std::string& thread_name) {
  assert(db_stress_env);
  // No recovery wait took place: nothing to report.
  if (initial_write_s.ok() || !IsErrorInjectedAndRetryable(initial_write_s) ||
      !initial_wal_write_may_succeed) {
    return;
  }
  const uint64_t now_micros = db_stress_env->NowMicros();
  const uint64_t elapsed_sec =
      (now_micros - wait_for_recover_start_time) / 1000000;
  // Short waits are not worth a log line.
  if (elapsed_sec <= 10) {
    return;
  }
  fprintf(stdout,
          "%s thread slept to wait for write recovery for "
          "%" PRIu64 " seconds\n",
          thread_name.c_str(), elapsed_sec);
}
|
||||
void GetDeleteRangeKeyLocks(
|
||||
ThreadState* thread, int rand_column_family, int64_t rand_key,
|
||||
std::vector<std::unique_ptr<MutexLock>>* range_locks) {
|
||||
|
@ -411,5 +451,6 @@ void InitializeOptionsGeneral(
|
|||
// user-defined timestamp.
|
||||
void CheckAndSetOptionsForUserTimestamp(Options& options);
|
||||
|
||||
bool ShouldDisableAutoCompactionsBeforeVerifyDb();
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
#endif // GFLAGS
|
||||
|
|
|
@ -32,41 +32,29 @@ void ExpectedState::Precommit(int cf, int64_t key, const ExpectedValue& value) {
|
|||
std::atomic_thread_fence(std::memory_order_release);
|
||||
}
|
||||
|
||||
PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key,
|
||||
bool* prepared) {
|
||||
assert(prepared);
|
||||
PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key) {
|
||||
ExpectedValue expected_value = Load(cf, key);
|
||||
|
||||
// Calculate the original expected value
|
||||
const ExpectedValue orig_expected_value = expected_value;
|
||||
|
||||
// Calculate the pending expected value
|
||||
bool res = expected_value.Put(true /* pending */);
|
||||
if (!res) {
|
||||
PendingExpectedValue ret = PendingExpectedValue(
|
||||
&Value(cf, key), orig_expected_value, orig_expected_value);
|
||||
*prepared = false;
|
||||
return ret;
|
||||
}
|
||||
expected_value.Put(true /* pending */);
|
||||
const ExpectedValue pending_expected_value = expected_value;
|
||||
|
||||
// Calculate the final expected value
|
||||
res = expected_value.Put(false /* pending */);
|
||||
assert(res);
|
||||
expected_value.Put(false /* pending */);
|
||||
const ExpectedValue final_expected_value = expected_value;
|
||||
|
||||
// Precommit
|
||||
Precommit(cf, key, pending_expected_value);
|
||||
*prepared = true;
|
||||
return PendingExpectedValue(&Value(cf, key), orig_expected_value,
|
||||
final_expected_value);
|
||||
}
|
||||
|
||||
// Returns the result of `Load(cf, key)`, i.e. the expected value currently
// stored for `key` in column family `cf`.
ExpectedValue ExpectedState::Get(int cf, int64_t key) { return Load(cf, key); }
|
||||
|
||||
PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key,
|
||||
bool* prepared) {
|
||||
assert(prepared);
|
||||
PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key) {
|
||||
ExpectedValue expected_value = Load(cf, key);
|
||||
|
||||
// Calculate the original expected value
|
||||
|
@ -77,47 +65,32 @@ PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key,
|
|||
if (!res) {
|
||||
PendingExpectedValue ret = PendingExpectedValue(
|
||||
&Value(cf, key), orig_expected_value, orig_expected_value);
|
||||
*prepared = false;
|
||||
return ret;
|
||||
}
|
||||
const ExpectedValue pending_expected_value = expected_value;
|
||||
|
||||
// Calculate the final expected value
|
||||
res = expected_value.Delete(false /* pending */);
|
||||
assert(res);
|
||||
expected_value.Delete(false /* pending */);
|
||||
const ExpectedValue final_expected_value = expected_value;
|
||||
|
||||
// Precommit
|
||||
Precommit(cf, key, pending_expected_value);
|
||||
*prepared = true;
|
||||
return PendingExpectedValue(&Value(cf, key), orig_expected_value,
|
||||
final_expected_value);
|
||||
}
|
||||
|
||||
PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key,
|
||||
bool* prepared) {
|
||||
return PrepareDelete(cf, key, prepared);
|
||||
PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key) {
|
||||
return PrepareDelete(cf, key);
|
||||
}
|
||||
|
||||
std::vector<PendingExpectedValue> ExpectedState::PrepareDeleteRange(
|
||||
int cf, int64_t begin_key, int64_t end_key, bool* prepared) {
|
||||
int cf, int64_t begin_key, int64_t end_key) {
|
||||
std::vector<PendingExpectedValue> pending_expected_values;
|
||||
bool has_prepared_failed = false;
|
||||
|
||||
for (int64_t key = begin_key; key < end_key; ++key) {
|
||||
bool each_prepared = false;
|
||||
PendingExpectedValue pending_expected_value =
|
||||
PrepareDelete(cf, key, &each_prepared);
|
||||
if (each_prepared) {
|
||||
pending_expected_values.push_back(pending_expected_value);
|
||||
} else {
|
||||
has_prepared_failed = true;
|
||||
pending_expected_value.PermitUnclosedPendingState();
|
||||
break;
|
||||
}
|
||||
pending_expected_values.push_back(PrepareDelete(cf, key));
|
||||
}
|
||||
|
||||
*prepared = !has_prepared_failed;
|
||||
return pending_expected_values;
|
||||
}
|
||||
|
||||
|
@ -759,8 +732,31 @@ Status FileExpectedStateManager::Restore(DB* db) {
|
|||
s = Env::Default()->DeleteFile(state_file_path);
|
||||
}
|
||||
if (s.ok()) {
|
||||
saved_seqno_ = kMaxSequenceNumber;
|
||||
s = Env::Default()->DeleteFile(trace_file_path);
|
||||
std::vector<std::string> expected_state_dir_children;
|
||||
s = Env::Default()->GetChildren(expected_state_dir_path_,
|
||||
&expected_state_dir_children);
|
||||
if (s.ok()) {
|
||||
for (size_t i = 0; i < expected_state_dir_children.size(); ++i) {
|
||||
const auto& filename = expected_state_dir_children[i];
|
||||
if (filename.size() >= kTraceFilenameSuffix.size() &&
|
||||
filename.rfind(kTraceFilenameSuffix) ==
|
||||
filename.size() - kTraceFilenameSuffix.size()) {
|
||||
SequenceNumber found_seqno = ParseUint64(filename.substr(
|
||||
0, filename.size() - kTraceFilenameSuffix.size()));
|
||||
// Delete older trace files, but keep the one we just replayed for
|
||||
// debugging purposes
|
||||
if (found_seqno < saved_seqno_) {
|
||||
s = Env::Default()->DeleteFile(GetPathForFilename(filename));
|
||||
}
|
||||
}
|
||||
if (!s.ok()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (s.ok()) {
|
||||
saved_seqno_ = kMaxSequenceNumber;
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
|
|
@ -44,7 +44,7 @@ class ExpectedState {
|
|||
//
|
||||
// Requires external locking covering `key` in `cf` to prevent concurrent
|
||||
// write or delete to the same `key`.
|
||||
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared);
|
||||
PendingExpectedValue PreparePut(int cf, int64_t key);
|
||||
|
||||
// Does not require external locking.
|
||||
ExpectedValue Get(int cf, int64_t key);
|
||||
|
@ -55,18 +55,17 @@ class ExpectedState {
|
|||
//
|
||||
// Requires external locking covering `key` in `cf` to prevent concurrent
|
||||
// write or delete to the same `key`.
|
||||
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared);
|
||||
PendingExpectedValue PrepareDelete(int cf, int64_t key);
|
||||
|
||||
// Requires external locking covering `key` in `cf` to prevent concurrent
|
||||
// write or delete to the same `key`.
|
||||
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key, bool* prepared);
|
||||
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key);
|
||||
|
||||
// Requires external locking covering keys in `[begin_key, end_key)` in `cf`
|
||||
// to prevent concurrent write or delete to the same `key`.
|
||||
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
|
||||
int64_t begin_key,
|
||||
int64_t end_key,
|
||||
bool* prepared);
|
||||
int64_t end_key);
|
||||
|
||||
// Update the expected value for start of an incomplete write or delete
|
||||
// operation on the key associated with this expected value
|
||||
|
@ -197,30 +196,28 @@ class ExpectedStateManager {
|
|||
void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); }
|
||||
|
||||
// See ExpectedState::PreparePut()
|
||||
PendingExpectedValue PreparePut(int cf, int64_t key, bool* prepared) {
|
||||
return latest_->PreparePut(cf, key, prepared);
|
||||
PendingExpectedValue PreparePut(int cf, int64_t key) {
|
||||
return latest_->PreparePut(cf, key);
|
||||
}
|
||||
|
||||
// See ExpectedState::Get()
|
||||
ExpectedValue Get(int cf, int64_t key) { return latest_->Get(cf, key); }
|
||||
|
||||
// See ExpectedState::PrepareDelete()
|
||||
PendingExpectedValue PrepareDelete(int cf, int64_t key, bool* prepared) {
|
||||
return latest_->PrepareDelete(cf, key, prepared);
|
||||
PendingExpectedValue PrepareDelete(int cf, int64_t key) {
|
||||
return latest_->PrepareDelete(cf, key);
|
||||
}
|
||||
|
||||
// See ExpectedState::PrepareSingleDelete()
|
||||
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key,
|
||||
bool* prepared) {
|
||||
return latest_->PrepareSingleDelete(cf, key, prepared);
|
||||
PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) {
|
||||
return latest_->PrepareSingleDelete(cf, key);
|
||||
}
|
||||
|
||||
// See ExpectedState::PrepareDeleteRange()
|
||||
std::vector<PendingExpectedValue> PrepareDeleteRange(int cf,
|
||||
int64_t begin_key,
|
||||
int64_t end_key,
|
||||
bool* prepared) {
|
||||
return latest_->PrepareDeleteRange(cf, begin_key, end_key, prepared);
|
||||
int64_t end_key) {
|
||||
return latest_->PrepareDeleteRange(cf, begin_key, end_key);
|
||||
}
|
||||
|
||||
// See ExpectedState::Exists()
|
||||
|
|
|
@ -10,11 +10,7 @@
|
|||
#include <atomic>
|
||||
|
||||
namespace ROCKSDB_NAMESPACE {
|
||||
bool ExpectedValue::Put(bool pending) {
|
||||
if (pending && (PendingWrite() || PendingDelete())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
void ExpectedValue::Put(bool pending) {
|
||||
if (pending) {
|
||||
SetPendingWrite();
|
||||
} else {
|
||||
|
@ -22,15 +18,10 @@ bool ExpectedValue::Put(bool pending) {
|
|||
ClearDeleted();
|
||||
ClearPendingWrite();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool ExpectedValue::Delete(bool pending) {
|
||||
if (pending && (PendingWrite() || PendingDelete())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!Exists()) {
|
||||
if (pending && !Exists()) {
|
||||
return false;
|
||||
}
|
||||
if (pending) {
|
||||
|
|
|
@ -37,11 +37,14 @@ class ExpectedValue {
|
|||
explicit ExpectedValue(uint32_t expected_value)
|
||||
: expected_value_(expected_value) {}
|
||||
|
||||
bool Exists() const { return PendingWrite() || !IsDeleted(); }
|
||||
// Returns true when this expected value is not marked deleted.
// Must only be called when neither a write nor a delete is pending on the
// value; this precondition is asserted.
bool Exists() const {
|
||||
assert(!PendingWrite() && !PendingDelete());
|
||||
return !IsDeleted();
|
||||
}
|
||||
|
||||
uint32_t Read() const { return expected_value_; }
|
||||
|
||||
bool Put(bool pending);
|
||||
void Put(bool pending);
|
||||
|
||||
bool Delete(bool pending);
|
||||
|
||||
|
|
|
@ -1619,28 +1619,21 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
// write
|
||||
bool initial_wal_write_may_succeed = true;
|
||||
|
||||
bool prepared = false;
|
||||
PendingExpectedValue pending_expected_value =
|
||||
shared->PreparePut(rand_column_family, rand_key, &prepared);
|
||||
if (!prepared) {
|
||||
pending_expected_value.PermitUnclosedPendingState();
|
||||
return s;
|
||||
}
|
||||
shared->PreparePut(rand_column_family, rand_key);
|
||||
|
||||
const uint32_t value_base = pending_expected_value.GetFinalValueBase();
|
||||
const size_t sz = GenerateValue(value_base, value, sizeof(value));
|
||||
const Slice v(value, sz);
|
||||
|
||||
uint64_t wait_for_recover_start_time = 0;
|
||||
do {
|
||||
// In order to commit the expected state for the initial write failed with
|
||||
// injected retryable error and successful WAL write, retry the write
|
||||
// until it succeeds after the recovery finishes
|
||||
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
||||
initial_wal_write_may_succeed) {
|
||||
lock.reset();
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
|
||||
lock.reset(new MutexLock(
|
||||
shared->GetMutexForKey(rand_column_family, rand_key)));
|
||||
}
|
||||
if (FLAGS_use_put_entity_one_in > 0 &&
|
||||
(value_base % FLAGS_use_put_entity_one_in) == 0) {
|
||||
|
@ -1691,13 +1684,10 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
});
|
||||
}
|
||||
}
|
||||
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when the
|
||||
// first write fails
|
||||
if (!s.ok() && initial_write_s.ok()) {
|
||||
initial_write_s = s;
|
||||
initial_wal_write_may_succeed =
|
||||
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
|
||||
}
|
||||
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
|
||||
&initial_wal_write_may_succeed,
|
||||
&wait_for_recover_start_time);
|
||||
|
||||
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
||||
initial_wal_write_may_succeed);
|
||||
|
||||
|
@ -1719,6 +1709,9 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
thread->shared->SafeTerminate();
|
||||
}
|
||||
} else {
|
||||
PrintWriteRecoveryWaitTimeIfNeeded(
|
||||
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
|
||||
wait_for_recover_start_time, "TestPut");
|
||||
pending_expected_value.Commit();
|
||||
thread->stats.AddBytesForWrites(1, sz);
|
||||
PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
|
||||
|
@ -1756,25 +1749,18 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
// Use delete if the key may be overwritten and a single deletion
|
||||
// otherwise.
|
||||
if (shared->AllowsOverwrite(rand_key)) {
|
||||
bool prepared = false;
|
||||
PendingExpectedValue pending_expected_value =
|
||||
shared->PrepareDelete(rand_column_family, rand_key, &prepared);
|
||||
if (!prepared) {
|
||||
pending_expected_value.PermitUnclosedPendingState();
|
||||
return s;
|
||||
}
|
||||
shared->PrepareDelete(rand_column_family, rand_key);
|
||||
|
||||
uint64_t wait_for_recover_start_time = 0;
|
||||
do {
|
||||
// In order to commit the expected state for the initial write failed
|
||||
// with injected retryable error and successful WAL write, retry the
|
||||
// write until it succeeds after the recovery finishes
|
||||
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
||||
initial_wal_write_may_succeed) {
|
||||
lock.reset();
|
||||
std::this_thread::sleep_for(
|
||||
std::chrono::microseconds(1 * 1000 * 1000));
|
||||
lock.reset(new MutexLock(
|
||||
shared->GetMutexForKey(rand_column_family, rand_key)));
|
||||
}
|
||||
if (!FLAGS_use_txn) {
|
||||
if (FLAGS_user_timestamp_size == 0) {
|
||||
|
@ -1787,13 +1773,9 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
return txn.Delete(cfh, key);
|
||||
});
|
||||
}
|
||||
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when
|
||||
// the first write fails
|
||||
if (!s.ok() && initial_write_s.ok()) {
|
||||
initial_write_s = s;
|
||||
initial_wal_write_may_succeed =
|
||||
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
|
||||
}
|
||||
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
|
||||
&initial_wal_write_may_succeed,
|
||||
&wait_for_recover_start_time);
|
||||
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
||||
initial_wal_write_may_succeed);
|
||||
|
||||
|
@ -1816,29 +1798,25 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
thread->shared->SafeTerminate();
|
||||
}
|
||||
} else {
|
||||
PrintWriteRecoveryWaitTimeIfNeeded(
|
||||
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
|
||||
wait_for_recover_start_time, "TestDelete");
|
||||
pending_expected_value.Commit();
|
||||
thread->stats.AddDeletes(1);
|
||||
}
|
||||
} else {
|
||||
bool prepared = false;
|
||||
PendingExpectedValue pending_expected_value =
|
||||
shared->PrepareSingleDelete(rand_column_family, rand_key, &prepared);
|
||||
if (!prepared) {
|
||||
pending_expected_value.PermitUnclosedPendingState();
|
||||
return s;
|
||||
}
|
||||
shared->PrepareSingleDelete(rand_column_family, rand_key);
|
||||
|
||||
uint64_t wait_for_recover_start_time = 0;
|
||||
do {
|
||||
// In order to commit the expected state for the initial write failed
|
||||
// with injected retryable error and successful WAL write, retry the
|
||||
// write until it succeeds after the recovery finishes
|
||||
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
||||
initial_wal_write_may_succeed) {
|
||||
lock.reset();
|
||||
std::this_thread::sleep_for(
|
||||
std::chrono::microseconds(1 * 1000 * 1000));
|
||||
lock.reset(new MutexLock(
|
||||
shared->GetMutexForKey(rand_column_family, rand_key)));
|
||||
}
|
||||
if (!FLAGS_use_txn) {
|
||||
if (FLAGS_user_timestamp_size == 0) {
|
||||
|
@ -1851,13 +1829,9 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
return txn.SingleDelete(cfh, key);
|
||||
});
|
||||
}
|
||||
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when
|
||||
// the first write fails
|
||||
if (!s.ok() && initial_write_s.ok()) {
|
||||
initial_write_s = s;
|
||||
initial_wal_write_may_succeed =
|
||||
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
|
||||
}
|
||||
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
|
||||
&initial_wal_write_may_succeed,
|
||||
&wait_for_recover_start_time);
|
||||
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
||||
initial_wal_write_may_succeed);
|
||||
|
||||
|
@ -1880,6 +1854,9 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
thread->shared->SafeTerminate();
|
||||
}
|
||||
} else {
|
||||
PrintWriteRecoveryWaitTimeIfNeeded(
|
||||
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
|
||||
wait_for_recover_start_time, "TestDelete");
|
||||
pending_expected_value.Commit();
|
||||
thread->stats.AddSingleDeletes(1);
|
||||
}
|
||||
|
@ -1914,18 +1891,9 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
// write
|
||||
bool initial_wal_write_may_succeed = true;
|
||||
|
||||
bool prepared = false;
|
||||
std::vector<PendingExpectedValue> pending_expected_values =
|
||||
shared->PrepareDeleteRange(rand_column_family, rand_key,
|
||||
rand_key + FLAGS_range_deletion_width,
|
||||
&prepared);
|
||||
if (!prepared) {
|
||||
for (PendingExpectedValue& pending_expected_value :
|
||||
pending_expected_values) {
|
||||
pending_expected_value.PermitUnclosedPendingState();
|
||||
}
|
||||
return s;
|
||||
}
|
||||
rand_key + FLAGS_range_deletion_width);
|
||||
|
||||
const int covered = static_cast<int>(pending_expected_values.size());
|
||||
std::string keystr = Key(rand_key);
|
||||
|
@ -1935,6 +1903,7 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
Slice end_key = end_keystr;
|
||||
std::string write_ts_str;
|
||||
Slice write_ts;
|
||||
uint64_t wait_for_recover_start_time = 0;
|
||||
|
||||
do {
|
||||
// In order to commit the expected state for the initial write failed with
|
||||
|
@ -1942,10 +1911,7 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
// until it succeeds after the recovery finishes
|
||||
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
||||
initial_wal_write_may_succeed) {
|
||||
range_locks.clear();
|
||||
std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
|
||||
GetDeleteRangeKeyLocks(thread, rand_column_family, rand_key,
|
||||
&range_locks);
|
||||
}
|
||||
if (FLAGS_user_timestamp_size) {
|
||||
write_ts_str = GetNowNanos();
|
||||
|
@ -1954,13 +1920,9 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
} else {
|
||||
s = db_->DeleteRange(write_opts, cfh, key, end_key);
|
||||
}
|
||||
// Only update `initial_write_s`, `initial_wal_write_may_succeed` when the
|
||||
// first write fails
|
||||
if (!s.ok() && initial_write_s.ok()) {
|
||||
initial_write_s = s;
|
||||
initial_wal_write_may_succeed =
|
||||
!FaultInjectionTestFS::IsFailedToWriteToWALError(initial_write_s);
|
||||
}
|
||||
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
|
||||
&initial_wal_write_may_succeed,
|
||||
&wait_for_recover_start_time);
|
||||
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
||||
initial_wal_write_may_succeed);
|
||||
|
||||
|
@ -1985,6 +1947,9 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
thread->shared->SafeTerminate();
|
||||
}
|
||||
} else {
|
||||
PrintWriteRecoveryWaitTimeIfNeeded(
|
||||
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
|
||||
wait_for_recover_start_time, "TestDeleteRange");
|
||||
for (PendingExpectedValue& pending_expected_value :
|
||||
pending_expected_values) {
|
||||
pending_expected_value.Commit();
|
||||
|
@ -2057,16 +2022,8 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
}
|
||||
keys.push_back(key);
|
||||
|
||||
bool prepared = false;
|
||||
PendingExpectedValue pending_expected_value =
|
||||
shared->PreparePut(column_family, key, &prepared);
|
||||
if (!prepared) {
|
||||
pending_expected_value.PermitUnclosedPendingState();
|
||||
for (PendingExpectedValue& pev : pending_expected_values) {
|
||||
pev.PermitUnclosedPendingState();
|
||||
}
|
||||
return;
|
||||
}
|
||||
shared->PreparePut(column_family, key);
|
||||
|
||||
const uint32_t value_base = pending_expected_value.GetFinalValueBase();
|
||||
values.push_back(value_base);
|
||||
|
@ -2630,6 +2587,8 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
// Value doesn't exist in db, update state to reflect that
|
||||
shared->SyncDelete(cf, key);
|
||||
return true;
|
||||
} else {
|
||||
assert(false);
|
||||
}
|
||||
}
|
||||
char expected_value_data[kValueMaxLen];
|
||||
|
@ -2728,7 +2687,11 @@ class NonBatchedOpsStressTest : public StressTest {
|
|||
SharedState* const shared = thread->shared;
|
||||
assert(shared);
|
||||
|
||||
if (!shared->AllowsOverwrite(key) && shared->Exists(column_family, key)) {
|
||||
const ExpectedValue expected_value =
|
||||
thread->shared->Get(column_family, key);
|
||||
bool may_exist = !ExpectedValueHelper::MustHaveNotExisted(expected_value,
|
||||
expected_value);
|
||||
if (!shared->AllowsOverwrite(key) && may_exist) {
|
||||
// Just do read your write checks for keys that allow overwrites.
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -181,10 +181,10 @@ FileOptions FileSystem::OptimizeForBlobFileRead(
|
|||
|
||||
IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
|
||||
const std::string& fname, bool should_sync,
|
||||
const IOOptions& io_options) {
|
||||
const IOOptions& io_options,
|
||||
const FileOptions& file_options) {
|
||||
std::unique_ptr<FSWritableFile> file;
|
||||
EnvOptions soptions;
|
||||
IOStatus s = fs->NewWritableFile(fname, soptions, &file, nullptr);
|
||||
IOStatus s = fs->NewWritableFile(fname, file_options, &file, nullptr);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
|
|
@ -31,6 +31,7 @@ DeleteScheduler::DeleteScheduler(SystemClock* clock, FileSystem* fs,
|
|||
total_trash_size_(0),
|
||||
rate_bytes_per_sec_(rate_bytes_per_sec),
|
||||
pending_files_(0),
|
||||
next_trash_bucket_(0),
|
||||
bytes_max_delete_chunk_(bytes_max_delete_chunk),
|
||||
closing_(false),
|
||||
cv_(&mu_),
|
||||
|
@ -66,10 +67,8 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
|
|||
total_trash_size_.load() > total_size * max_trash_db_ratio_.load())) {
|
||||
// Rate limiting is disabled or trash size makes up more than
|
||||
// max_trash_db_ratio_ (default 25%) of the total DB size
|
||||
TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
|
||||
Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
|
||||
Status s = DeleteFileImmediately(file_path, /*accounted=*/true);
|
||||
if (s.ok()) {
|
||||
s = sst_file_manager_->OnDeleteFile(file_path);
|
||||
ROCKS_LOG_INFO(info_log_,
|
||||
"Deleted file %s immediately, rate_bytes_per_sec %" PRIi64
|
||||
", total_trash_size %" PRIu64 ", total_size %" PRIi64
|
||||
|
@ -77,15 +76,57 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
|
|||
file_path.c_str(), rate_bytes_per_sec_.load(),
|
||||
total_trash_size_.load(), total_size,
|
||||
max_trash_db_ratio_.load());
|
||||
InstrumentedMutexLock l(&mu_);
|
||||
RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
return AddFileToDeletionQueue(file_path, dir_to_sync, /*bucket=*/std::nullopt,
|
||||
/*accounted=*/true);
|
||||
}
|
||||
|
||||
// Deletes `file_path` with `accounted` set to false on every path.
//
// The file is removed right away when the configured delete rate is not
// positive, or when it has more than one hard link and `force_bg` is false.
// Otherwise it is handed to AddFileToDeletionQueue() together with
// `dir_to_sync` and the optional `bucket`.
Status DeleteScheduler::DeleteUnaccountedFile(const std::string& file_path,
                                              const std::string& dir_to_sync,
                                              const bool force_bg,
                                              std::optional<int32_t> bucket) {
  // Best effort: if the link count cannot be read, fall back to 1.
  uint64_t link_count = 1;
  fs_->NumFileLinks(file_path, IOOptions(), &link_count, nullptr)
      .PermitUncheckedError();

  // We can tolerate rare races where we might immediately delete both links
  // to a file.
  const bool rate_limiting_off = rate_bytes_per_sec_.load() <= 0;
  const bool has_other_links = !force_bg && link_count > 1;
  if (!rate_limiting_off && !has_other_links) {
    return AddFileToDeletionQueue(file_path, dir_to_sync, bucket,
                                  /*accounted=*/false);
  }

  Status s = DeleteFileImmediately(file_path, /*accounted=*/false);
  if (s.ok()) {
    ROCKS_LOG_INFO(info_log_,
                   "Deleted file %s immediately, rate_bytes_per_sec %" PRIi64,
                   file_path.c_str(), rate_bytes_per_sec_.load());
  }
  return s;
}
|
||||
|
||||
// Removes `file_path` through the file system without going through the
// trash queue.
//
// On a successful delete, OnDeleteFile() is invoked with `accounted` and its
// status becomes the return value; FILES_DELETED_IMMEDIATELY is then ticked
// while holding `mu_`. If the file system delete fails, that status is
// returned and nothing else happens.
Status DeleteScheduler::DeleteFileImmediately(const std::string& file_path,
                                              bool accounted) {
  TEST_SYNC_POINT("DeleteScheduler::DeleteFile");
  TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteFile::cb",
                           const_cast<std::string*>(&file_path));
  Status s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
  if (!s.ok()) {
    return s;
  }
  s = OnDeleteFile(file_path, accounted);
  {
    // The tick is recorded whenever the file itself was removed, regardless
    // of what OnDeleteFile() reported.
    InstrumentedMutexLock l(&mu_);
    RecordTick(stats_.get(), FILES_DELETED_IMMEDIATELY);
  }
  return s;
}
|
||||
|
||||
Status DeleteScheduler::AddFileToDeletionQueue(const std::string& file_path,
|
||||
const std::string& dir_to_sync,
|
||||
std::optional<int32_t> bucket,
|
||||
bool accounted) {
|
||||
// Move file to trash
|
||||
std::string trash_file;
|
||||
Status s = MarkAsTrash(file_path, &trash_file);
|
||||
Status s = MarkAsTrash(file_path, accounted, &trash_file);
|
||||
ROCKS_LOG_INFO(info_log_, "Mark file: %s as trash -- %s", trash_file.c_str(),
|
||||
s.ToString().c_str());
|
||||
|
||||
|
@ -94,7 +135,7 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
|
|||
file_path.c_str(), s.ToString().c_str());
|
||||
s = fs_->DeleteFile(file_path, IOOptions(), nullptr);
|
||||
if (s.ok()) {
|
||||
s = sst_file_manager_->OnDeleteFile(file_path);
|
||||
s = OnDeleteFile(file_path, accounted);
|
||||
ROCKS_LOG_INFO(info_log_, "Deleted file %s immediately",
|
||||
trash_file.c_str());
|
||||
InstrumentedMutexLock l(&mu_);
|
||||
|
@ -104,11 +145,13 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
|
|||
}
|
||||
|
||||
// Update the total trash size
|
||||
uint64_t trash_file_size = 0;
|
||||
IOStatus io_s =
|
||||
fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
|
||||
if (io_s.ok()) {
|
||||
total_trash_size_.fetch_add(trash_file_size);
|
||||
if (accounted) {
|
||||
uint64_t trash_file_size = 0;
|
||||
IOStatus io_s =
|
||||
fs_->GetFileSize(trash_file, IOOptions(), &trash_file_size, nullptr);
|
||||
if (io_s.ok()) {
|
||||
total_trash_size_.fetch_add(trash_file_size);
|
||||
}
|
||||
}
|
||||
//**TODO: What should we do if we failed to
|
||||
// get the file size?
|
||||
|
@ -117,8 +160,15 @@ Status DeleteScheduler::DeleteFile(const std::string& file_path,
|
|||
{
|
||||
InstrumentedMutexLock l(&mu_);
|
||||
RecordTick(stats_.get(), FILES_MARKED_TRASH);
|
||||
queue_.emplace(trash_file, dir_to_sync);
|
||||
queue_.emplace(trash_file, dir_to_sync, accounted, bucket);
|
||||
pending_files_++;
|
||||
if (bucket.has_value()) {
|
||||
auto iter = pending_files_in_buckets_.find(bucket.value());
|
||||
assert(iter != pending_files_in_buckets_.end());
|
||||
if (iter != pending_files_in_buckets_.end()) {
|
||||
iter->second++;
|
||||
}
|
||||
}
|
||||
if (pending_files_ == 1) {
|
||||
cv_.SignalAll();
|
||||
}
|
||||
|
@ -177,7 +227,7 @@ Status DeleteScheduler::CleanupDirectory(Env* env, SstFileManagerImpl* sfm,
|
|||
}
|
||||
|
||||
Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
|
||||
std::string* trash_file) {
|
||||
bool accounted, std::string* trash_file) {
|
||||
// Sanity check of the path
|
||||
size_t idx = file_path.rfind('/');
|
||||
if (idx == std::string::npos || idx == file_path.size() - 1) {
|
||||
|
@ -211,7 +261,7 @@ Status DeleteScheduler::MarkAsTrash(const std::string& file_path,
|
|||
}
|
||||
cnt++;
|
||||
}
|
||||
if (s.ok()) {
|
||||
if (s.ok() && accounted) {
|
||||
s = sst_file_manager_->OnMoveFile(file_path, *trash_file);
|
||||
}
|
||||
return s;
|
||||
|
@ -235,6 +285,8 @@ void DeleteScheduler::BackgroundEmptyTrash() {
|
|||
uint64_t total_deleted_bytes = 0;
|
||||
int64_t current_delete_rate = rate_bytes_per_sec_.load();
|
||||
while (!queue_.empty() && !closing_) {
|
||||
// Satisfy static analysis.
|
||||
std::optional<int32_t> bucket = std::nullopt;
|
||||
if (current_delete_rate != rate_bytes_per_sec_.load()) {
|
||||
// User changed the delete rate
|
||||
current_delete_rate = rate_bytes_per_sec_.load();
|
||||
|
@ -247,14 +299,17 @@ void DeleteScheduler::BackgroundEmptyTrash() {
|
|||
// Get new file to delete
|
||||
const FileAndDir& fad = queue_.front();
|
||||
std::string path_in_trash = fad.fname;
|
||||
std::string dir_to_sync = fad.dir;
|
||||
bool accounted = fad.accounted;
|
||||
bucket = fad.bucket;
|
||||
|
||||
// We don't need to hold the lock while deleting the file
|
||||
mu_.Unlock();
|
||||
uint64_t deleted_bytes = 0;
|
||||
bool is_complete = true;
|
||||
// Delete file from trash and update total_penalty value
|
||||
Status s =
|
||||
DeleteTrashFile(path_in_trash, fad.dir, &deleted_bytes, &is_complete);
|
||||
Status s = DeleteTrashFile(path_in_trash, dir_to_sync, accounted,
|
||||
&deleted_bytes, &is_complete);
|
||||
total_deleted_bytes += deleted_bytes;
|
||||
mu_.Lock();
|
||||
if (is_complete) {
|
||||
|
@ -288,12 +343,20 @@ void DeleteScheduler::BackgroundEmptyTrash() {
|
|||
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait",
|
||||
&total_penalty);
|
||||
|
||||
int32_t pending_files_in_bucket = std::numeric_limits<int32_t>::max();
|
||||
if (is_complete) {
|
||||
pending_files_--;
|
||||
if (bucket.has_value()) {
|
||||
auto iter = pending_files_in_buckets_.find(bucket.value());
|
||||
assert(iter != pending_files_in_buckets_.end());
|
||||
if (iter != pending_files_in_buckets_.end()) {
|
||||
pending_files_in_bucket = iter->second--;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (pending_files_ == 0) {
|
||||
// Unblock WaitForEmptyTrash since there are no more files waiting
|
||||
// to be deleted
|
||||
if (pending_files_ == 0 || pending_files_in_bucket == 0) {
|
||||
// Unblock WaitForEmptyTrash or WaitForEmptyTrashBucket since there are
|
||||
// no more files waiting to be deleted
|
||||
cv_.SignalAll();
|
||||
}
|
||||
}
|
||||
|
@ -302,12 +365,14 @@ void DeleteScheduler::BackgroundEmptyTrash() {
|
|||
|
||||
Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
|
||||
const std::string& dir_to_sync,
|
||||
uint64_t* deleted_bytes,
|
||||
bool accounted, uint64_t* deleted_bytes,
|
||||
bool* is_complete) {
|
||||
uint64_t file_size;
|
||||
Status s = fs_->GetFileSize(path_in_trash, IOOptions(), &file_size, nullptr);
|
||||
*is_complete = true;
|
||||
TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile");
|
||||
TEST_SYNC_POINT_CALLBACK("DeleteScheduler::DeleteTrashFile::cb",
|
||||
const_cast<std::string*>(&path_in_trash));
|
||||
if (s.ok()) {
|
||||
bool need_full_delete = true;
|
||||
if (bytes_max_delete_chunk_ != 0 && file_size > bytes_max_delete_chunk_) {
|
||||
|
@ -374,7 +439,7 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
|
|||
}
|
||||
if (s.ok()) {
|
||||
*deleted_bytes = file_size;
|
||||
s = sst_file_manager_->OnDeleteFile(path_in_trash);
|
||||
s = OnDeleteFile(path_in_trash, accounted);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -384,12 +449,24 @@ Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash,
|
|||
path_in_trash.c_str(), s.ToString().c_str());
|
||||
*deleted_bytes = 0;
|
||||
} else {
|
||||
total_trash_size_.fetch_sub(*deleted_bytes);
|
||||
if (accounted) {
|
||||
total_trash_size_.fetch_sub(*deleted_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
// Post-deletion bookkeeping for a file that has been removed.
// Accounted files are reported to the SstFileManager; unaccounted files only
// trigger the "DeleteScheduler::OnDeleteFile" test sync point and succeed.
Status DeleteScheduler::OnDeleteFile(const std::string& file_path,
                                     bool accounted) {
  if (!accounted) {
    // Nothing to update for unaccounted files; expose the path to tests only.
    TEST_SYNC_POINT_CALLBACK("DeleteScheduler::OnDeleteFile",
                             const_cast<std::string*>(&file_path));
    return Status::OK();
  }
  return sst_file_manager_->OnDeleteFile(file_path);
}
|
||||
|
||||
void DeleteScheduler::WaitForEmptyTrash() {
|
||||
InstrumentedMutexLock l(&mu_);
|
||||
while (pending_files_ > 0 && !closing_) {
|
||||
|
@ -397,6 +474,30 @@ void DeleteScheduler::WaitForEmptyTrash() {
|
|||
}
|
||||
}
|
||||
|
||||
std::optional<int32_t> DeleteScheduler::NewTrashBucket() {
|
||||
if (rate_bytes_per_sec_.load() <= 0) {
|
||||
return std::nullopt;
|
||||
}
|
||||
InstrumentedMutexLock l(&mu_);
|
||||
int32_t bucket_number = next_trash_bucket_++;
|
||||
pending_files_in_buckets_.emplace(bucket_number, 0);
|
||||
return bucket_number;
|
||||
}
|
||||
|
||||
void DeleteScheduler::WaitForEmptyTrashBucket(int32_t bucket) {
|
||||
InstrumentedMutexLock l(&mu_);
|
||||
if (bucket >= next_trash_bucket_) {
|
||||
return;
|
||||
}
|
||||
auto iter = pending_files_in_buckets_.find(bucket);
|
||||
while (iter != pending_files_in_buckets_.end() && iter->second > 0 &&
|
||||
!closing_) {
|
||||
cv_.Wait();
|
||||
iter = pending_files_in_buckets_.find(bucket);
|
||||
}
|
||||
pending_files_in_buckets_.erase(bucket);
|
||||
}
|
||||
|
||||
void DeleteScheduler::MaybeCreateBackgroundThread() {
|
||||
if (bg_thread_ == nullptr && rate_bytes_per_sec_.load() > 0) {
|
||||
bg_thread_.reset(
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
|
||||
#include <map>
|
||||
#include <optional>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
@ -48,16 +49,45 @@ class DeleteScheduler {
|
|||
MaybeCreateBackgroundThread();
|
||||
}
|
||||
|
||||
// Mark file as trash directory and schedule its deletion. If force_bg is
|
||||
// set, it forces the file to always be deleted in the background thread,
|
||||
// except when rate limiting is disabled
|
||||
// Delete an accounted file that is tracked by `SstFileManager` and should be
|
||||
// tracked by this `DeleteScheduler` when it's deleted.
|
||||
// The file is deleted immediately if slow deletion is disabled. If force_bg
|
||||
// is not set and trash to db size ratio exceeded the configured threshold,
|
||||
// it is immediately deleted too. In all other cases, the file will be moved
|
||||
// to a trash directory and scheduled for deletion by a background thread.
|
||||
Status DeleteFile(const std::string& fname, const std::string& dir_to_sync,
|
||||
const bool force_bg = false);
|
||||
|
||||
// Wait for all files being deleteing in the background to finish or for
|
||||
// Delete an unaccounted file that is not tracked by `SstFileManager` and
|
||||
// should not be tracked by this `DeleteScheduler` when it's deleted.
|
||||
// The file is deleted immediately if slow deletion is disabled. If force_bg
|
||||
// is not set and the file has more than 1 hard link, it is immediately
|
||||
// deleted too. In all other cases, the file will be moved to a trash
|
||||
// directory and scheduled for deletion by a background thread.
|
||||
// This API also supports assigning a file to a specified bucket created by
|
||||
// `NewTrashBucket` when deleting files in the background, so the caller can
|
||||
// wait for a specific bucket to be empty by calling the
|
||||
// `WaitForEmptyTrashBucket` API.
|
||||
Status DeleteUnaccountedFile(const std::string& file_path,
|
||||
const std::string& dir_to_sync,
|
||||
const bool force_bg = false,
|
||||
std::optional<int32_t> bucket = std::nullopt);
|
||||
|
||||
// Wait for all files being deleted in the background to finish or for
|
||||
// destructor to be called.
|
||||
void WaitForEmptyTrash();
|
||||
|
||||
// Creates a new trash bucket. A bucket is only created and returned when slow
|
||||
// deletion is enabled.
|
||||
// For each bucket that is created, the user should also call
|
||||
// `WaitForEmptyTrashBucket` after scheduling file deletions to make sure the
|
||||
// trash files are all cleared.
|
||||
std::optional<int32_t> NewTrashBucket();
|
||||
|
||||
// Wait for all the files in the specified bucket to be deleted in the
|
||||
// background or for the destructor to be called.
|
||||
void WaitForEmptyTrashBucket(int32_t bucket);
|
||||
|
||||
// Return a map containing errors that happened in BackgroundEmptyTrash
|
||||
// file_path => error status
|
||||
std::map<std::string, Status> GetBackgroundErrors();
|
||||
|
@ -87,12 +117,21 @@ class DeleteScheduler {
|
|||
}
|
||||
|
||||
private:
|
||||
Status MarkAsTrash(const std::string& file_path, std::string* path_in_trash);
|
||||
Status DeleteFileImmediately(const std::string& file_path, bool accounted);
|
||||
|
||||
Status AddFileToDeletionQueue(const std::string& file_path,
|
||||
const std::string& dir_to_sync,
|
||||
std::optional<int32_t> bucket, bool accounted);
|
||||
|
||||
Status MarkAsTrash(const std::string& file_path, bool accounted,
|
||||
std::string* path_in_trash);
|
||||
|
||||
Status DeleteTrashFile(const std::string& path_in_trash,
|
||||
const std::string& dir_to_sync,
|
||||
const std::string& dir_to_sync, bool accounted,
|
||||
uint64_t* deleted_bytes, bool* is_complete);
|
||||
|
||||
Status OnDeleteFile(const std::string& file_path, bool accounted);
|
||||
|
||||
void BackgroundEmptyTrash();
|
||||
|
||||
void MaybeCreateBackgroundThread();
|
||||
|
@ -104,19 +143,28 @@ class DeleteScheduler {
|
|||
std::atomic<uint64_t> total_trash_size_;
|
||||
// Maximum number of bytes that should be deleted per second
|
||||
std::atomic<int64_t> rate_bytes_per_sec_;
|
||||
// Mutex to protect queue_, pending_files_, bg_errors_, closing_, stats_
|
||||
// Mutex to protect queue_, pending_files_, next_trash_bucket_,
|
||||
// pending_files_in_buckets_, bg_errors_, closing_, stats_
|
||||
InstrumentedMutex mu_;
|
||||
|
||||
struct FileAndDir {
|
||||
FileAndDir(const std::string& f, const std::string& d) : fname(f), dir(d) {}
|
||||
FileAndDir(const std::string& _fname, const std::string& _dir,
|
||||
bool _accounted, std::optional<int32_t> _bucket)
|
||||
: fname(_fname), dir(_dir), accounted(_accounted), bucket(_bucket) {}
|
||||
std::string fname;
|
||||
std::string dir; // empty will be skipped.
|
||||
bool accounted;
|
||||
std::optional<int32_t> bucket;
|
||||
};
|
||||
|
||||
// Queue of trash files that need to be deleted
|
||||
std::queue<FileAndDir> queue_;
|
||||
// Number of trash files that are waiting to be deleted
|
||||
int32_t pending_files_;
|
||||
// Next trash bucket that can be created
|
||||
int32_t next_trash_bucket_;
|
||||
// A mapping from trash bucket to number of pending files in the bucket
|
||||
std::map<int32_t, int32_t> pending_files_in_buckets_;
|
||||
uint64_t bytes_max_delete_chunk_;
|
||||
// Errors that happened in BackgroundEmptyTrash (file_path => error)
|
||||
std::map<std::string, Status> bg_errors_;
|
||||
|
@ -127,6 +175,7 @@ class DeleteScheduler {
|
|||
// Condition variable signaled in these conditions
|
||||
// - pending_files_ value change from 0 => 1
|
||||
// - pending_files_ value change from 1 => 0
|
||||
// - a value in pending_files_in_buckets change from 1 => 0
|
||||
// - closing_ value is set to true
|
||||
InstrumentedCondVar cv_;
|
||||
// Background thread running BackgroundEmptyTrash
|
||||
|
@ -138,6 +187,10 @@ class DeleteScheduler {
|
|||
// If the trash size accounts for more than this fraction of the total DB
|
||||
// size we will start deleting new files passed to DeleteScheduler
|
||||
// immediately
|
||||
// Unaccounted files passed for deletion will not cause change in
|
||||
// total_trash_size_ or affect the DeleteScheduler::total_trash_size_ over
|
||||
// SstFileManager::total_size_ ratio. Their slow deletion is not subject to
|
||||
// this configured threshold either.
|
||||
std::atomic<double> max_trash_db_ratio_;
|
||||
static const uint64_t kMicrosInSecond = 1000 * 1000LL;
|
||||
std::shared_ptr<Statistics> stats_;
|
||||
|
|
|
@ -78,7 +78,7 @@ class DeleteSchedulerTest : public testing::Test {
|
|||
}
|
||||
|
||||
std::string NewDummyFile(const std::string& file_name, uint64_t size = 1024,
|
||||
size_t dummy_files_dirs_idx = 0) {
|
||||
size_t dummy_files_dirs_idx = 0, bool track = true) {
|
||||
std::string file_path =
|
||||
dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name;
|
||||
std::unique_ptr<WritableFile> f;
|
||||
|
@ -86,7 +86,9 @@ class DeleteSchedulerTest : public testing::Test {
|
|||
std::string data(size, 'A');
|
||||
EXPECT_OK(f->Append(data));
|
||||
EXPECT_OK(f->Close());
|
||||
EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
|
||||
if (track) {
|
||||
EXPECT_OK(sst_file_mgr_->OnAddFile(file_path));
|
||||
}
|
||||
return file_path;
|
||||
}
|
||||
|
||||
|
@ -353,6 +355,8 @@ TEST_F(DeleteSchedulerTest, DisableRateLimiting) {
|
|||
ASSERT_EQ(num_files,
|
||||
stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
|
||||
|
||||
ASSERT_FALSE(delete_scheduler_->NewTrashBucket().has_value());
|
||||
|
||||
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
||||
}
|
||||
|
||||
|
@ -718,6 +722,141 @@ TEST_F(DeleteSchedulerTest, IsTrashCheck) {
|
|||
ASSERT_FALSE(DeleteScheduler::IsTrashFile("abc.trashx"));
|
||||
}
|
||||
|
||||
// Interleaves accounted deletions (DeleteFile) with unaccounted deletions
// (DeleteUnaccountedFile) and checks that both the trash size and the
// SstFileManager total size are zero once the trash has been emptied.
TEST_F(DeleteSchedulerTest, DeleteAccountedAndUnaccountedFiles) {
  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / s
  NewDeleteScheduler();

  // 100 dummy files of 1 KB each, none of them tracked at creation time.
  const int num_files = 100;
  const uint64_t file_size = 1024;
  std::vector<std::string> generated_files;
  for (int n = 0; n < num_files; n++) {
    const std::string file_name = "file" + std::to_string(n) + ".data";
    std::string path = NewDummyFile(file_name, file_size,
                                    /*dummy_files_dirs_idx*/ 0,
                                    /*track=*/false);
    generated_files.push_back(path);
  }

  for (int n = 0; n < num_files; n++) {
    const std::string& path = generated_files[n];
    const bool accounted = (n % 2) != 0;
    if (!accounted) {
      // Even indices: never registered, deleted through the unaccounted API.
      ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(path, ""));
      continue;
    }
    // Odd indices: register with the SstFileManager first, then delete.
    ASSERT_OK(sst_file_mgr_->OnAddFile(path, file_size));
    ASSERT_OK(delete_scheduler_->DeleteFile(path, ""));
  }

  delete_scheduler_->WaitForEmptyTrash();
  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
  ASSERT_EQ(0, sst_file_mgr_->GetTotalSize());
}
|
||||
|
||||
// Ten threads each create their own trash bucket, schedule 100 unaccounted
// files into it and wait for that bucket to drain. Afterwards every file must
// have been deleted via the background (trash) path and none immediately.
TEST_F(DeleteSchedulerTest, ConcurrentlyDeleteUnaccountedFilesInBuckets) {
  int bg_deletions = 0;
  int fg_deletions = 0;
  auto* sync_point = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  sync_point->SetCallBack("DeleteScheduler::DeleteTrashFile:DeleteFile",
                          [&](void* /*arg*/) { bg_deletions++; });
  sync_point->SetCallBack("DeleteScheduler::DeleteFile",
                          [&](void* /*arg*/) { fg_deletions++; });
  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / s
  NewDeleteScheduler();

  sync_point->EnableProcessing();
  // 1000 untracked dummy files, 1 KB each.
  const int num_files = 1000;
  const uint64_t file_size = 1024;
  std::vector<std::string> generated_files;
  for (int n = 0; n < num_files; n++) {
    const std::string file_name = "file" + std::to_string(n) + ".data";
    std::string path = NewDummyFile(file_name, file_size,
                                    /*dummy_files_dirs_idx*/ 0,
                                    /*track=*/false);
    generated_files.push_back(path);
  }

  // Each worker claims a disjoint slice of the files, deletes them through
  // its own bucket and then waits for that bucket to become empty.
  const int thread_cnt = 10;
  const int files_per_thread = 100;
  std::atomic<int> next_slice(0);
  std::vector<port::Thread> workers;
  auto worker_body = [&]() {
    std::optional<int32_t> bucket = delete_scheduler_->NewTrashBucket();
    ASSERT_TRUE(bucket.has_value());
    const int slice = next_slice.fetch_add(1);
    const int first = slice * files_per_thread;
    const int last = first + files_per_thread;
    for (int k = first; k < last; k++) {
      ASSERT_OK(delete_scheduler_->DeleteUnaccountedFile(
          generated_files[k], "", /*force_bg=*/false, bucket));
    }
    delete_scheduler_->WaitForEmptyTrashBucket(bucket.value());
  };

  for (int t = 0; t < thread_cnt; t++) {
    workers.emplace_back(worker_body);
  }

  for (auto& worker : workers) {
    worker.join();
  }

  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
  ASSERT_EQ(1000, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
  ASSERT_EQ(0, fg_deletions);
  ASSERT_EQ(1000, bg_deletions);

  // Waiting again on a bucket that is already empty is fine.
  delete_scheduler_->WaitForEmptyTrashBucket(9);
  // A bucket number that was never created returns right away as well.
  delete_scheduler_->WaitForEmptyTrashBucket(100);
  // Bucket numbers keep increasing: after buckets 0..9 the next one is 10.
  std::optional<int32_t> next_bucket = delete_scheduler_->NewTrashBucket();
  ASSERT_TRUE(next_bucket.has_value());
  ASSERT_EQ(10, next_bucket.value());
  delete_scheduler_->WaitForEmptyTrashBucket(10);

  sync_point->DisableProcessing();
}
|
||||
|
||||
// Unaccounted files that still have another hard link are expected to be
// deleted immediately (foreground path) instead of being moved to trash, even
// though slow deletion is enabled. The test counts foreground and background
// deletions through sync point callbacks and checks the matching tickers.
TEST_F(DeleteSchedulerTest,
       ImmediatelyDeleteUnaccountedFilesWithRemainingLinks) {
  int bg_delete_file = 0;
  int fg_delete_file = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DeleteScheduler::DeleteTrashFile:DeleteFile",
      [&](void* /*arg*/) { bg_delete_file++; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { fg_delete_file++; });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  rate_bytes_per_sec_ = 1024 * 1024;  // 1 MB / sec
  NewDeleteScheduler();

  std::string file1 = NewDummyFile("data_1", 500 * 1024,
                                   /*dummy_files_dirs_idx*/ 0, /*track=*/false);
  std::string file2 = NewDummyFile("data_2", 100 * 1024,
                                   /*dummy_files_dirs_idx*/ 0, /*track=*/false);

  // Give each file a second hard link so its link count is greater than 1.
  ASSERT_OK(env_->LinkFile(file1, dummy_files_dirs_[0] + "/data_1b"));
  ASSERT_OK(env_->LinkFile(file2, dummy_files_dirs_[0] + "/data_2b"));

  // Should delete in 4 batch if there is no hardlink
  ASSERT_OK(
      delete_scheduler_->DeleteUnaccountedFile(file1, "", /*force_bg=*/false));
  ASSERT_OK(
      delete_scheduler_->DeleteUnaccountedFile(file2, "", /*force_bg=*/false));

  delete_scheduler_->WaitForEmptyTrash();

  ASSERT_EQ(0, delete_scheduler_->GetTotalTrashSize());
  ASSERT_EQ(0, bg_delete_file);
  ASSERT_EQ(2, fg_delete_file);
  ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH));
  ASSERT_EQ(2, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
  // Turn sync point processing back off: the callbacks above capture this
  // test's stack locals by reference and must not fire after it returns.
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
|
|
@ -125,8 +125,8 @@ IOStatus CreateFile(FileSystem* fs, const std::string& destination,
|
|||
Status DeleteDBFile(const ImmutableDBOptions* db_options,
|
||||
const std::string& fname, const std::string& dir_to_sync,
|
||||
const bool force_bg, const bool force_fg) {
|
||||
SstFileManagerImpl* sfm =
|
||||
static_cast<SstFileManagerImpl*>(db_options->sst_file_manager.get());
|
||||
SstFileManagerImpl* sfm = static_cast_with_check<SstFileManagerImpl>(
|
||||
db_options->sst_file_manager.get());
|
||||
if (sfm && !force_fg) {
|
||||
return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg);
|
||||
} else {
|
||||
|
@ -134,6 +134,21 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
|
|||
}
|
||||
}
|
||||
|
||||
// Deletes a DB file without SstFileManager accounting.
// With an SstFileManager configured and `force_fg` unset, the deletion is
// handed to SstFileManagerImpl::ScheduleUnaccountedFileDeletion (passing along
// `force_bg` and `bucket`); otherwise the file is removed directly via Env.
Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
                               const std::string& fname,
                               const std::string& dir_to_sync,
                               const bool force_bg, const bool force_fg,
                               std::optional<int32_t> bucket) {
  auto* manager = static_cast_with_check<SstFileManagerImpl>(
      db_options->sst_file_manager.get());
  const bool delete_directly = !manager || force_fg;
  if (delete_directly) {
    return db_options->env->DeleteFile(fname);
  }
  return manager->ScheduleUnaccountedFileDeletion(fname, dir_to_sync, force_bg,
                                                  bucket);
}
|
||||
|
||||
// requested_checksum_func_name brings the function name of the checksum
|
||||
// generator in checksum_factory. Empty string is permitted, in which case the
|
||||
// name of the generator created by the factory is unchecked. When
|
||||
|
|
|
@ -55,6 +55,16 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options,
|
|||
const std::string& fname, const std::string& path_to_sync,
|
||||
const bool force_bg, const bool force_fg);
|
||||
|
||||
// Delete an unaccounted DB file that is not tracked by SstFileManager and will
|
||||
// not be tracked by its DeleteScheduler when getting deleted.
|
||||
// If a legitimate bucket is provided and this file is scheduled for slow
|
||||
// deletion, it will be assigned to the specified trash bucket.
|
||||
Status DeleteUnaccountedDBFile(const ImmutableDBOptions* db_options,
|
||||
const std::string& fname,
|
||||
const std::string& dir_to_sync,
|
||||
const bool force_bg, const bool force_fg,
|
||||
std::optional<int32_t> bucket);
|
||||
|
||||
// TODO(hx235): pass the whole DBOptions instead of its individual fields
|
||||
IOStatus GenerateOneFileChecksum(
|
||||
FileSystem* fs, const std::string& file_path,
|
||||
|
|
|
@ -388,6 +388,7 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
|
|||
|
||||
IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
|
||||
const std::string& dbname, uint64_t descriptor_number,
|
||||
Temperature temp,
|
||||
FSDirectory* dir_contains_current_file) {
|
||||
// Remove leading "dbname/" and add newline to manifest file name
|
||||
std::string manifest = DescriptorFileName(dbname, descriptor_number);
|
||||
|
@ -397,8 +398,11 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
|
|||
std::string tmp = TempFileName(dbname, descriptor_number);
|
||||
IOOptions opts;
|
||||
IOStatus s = PrepareIOFromWriteOptions(write_options, opts);
|
||||
FileOptions file_opts;
|
||||
file_opts.temperature = temp;
|
||||
if (s.ok()) {
|
||||
s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts);
|
||||
s = WriteStringToFile(fs, contents.ToString() + "\n", tmp, true, opts,
|
||||
file_opts);
|
||||
}
|
||||
TEST_SYNC_POINT_CALLBACK("SetCurrentFile:BeforeRename", &s);
|
||||
if (s.ok()) {
|
||||
|
@ -423,7 +427,8 @@ IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
|
|||
}
|
||||
|
||||
Status SetIdentityFile(const WriteOptions& write_options, Env* env,
|
||||
const std::string& dbname, const std::string& db_id) {
|
||||
const std::string& dbname, Temperature temp,
|
||||
const std::string& db_id) {
|
||||
std::string id;
|
||||
if (db_id.empty()) {
|
||||
id = env->GenerateUniqueId();
|
||||
|
@ -437,8 +442,11 @@ Status SetIdentityFile(const WriteOptions& write_options, Env* env,
|
|||
Status s;
|
||||
IOOptions opts;
|
||||
s = PrepareIOFromWriteOptions(write_options, opts);
|
||||
FileOptions file_opts;
|
||||
file_opts.temperature = temp;
|
||||
if (s.ok()) {
|
||||
s = WriteStringToFile(env, id, tmp, true, &opts);
|
||||
s = WriteStringToFile(env->GetFileSystem().get(), id, tmp,
|
||||
/*should_sync=*/true, opts, file_opts);
|
||||
}
|
||||
if (s.ok()) {
|
||||
s = env->RenameFile(tmp, identify_file_name);
|
||||
|
|
|
@ -161,11 +161,12 @@ bool ParseFileName(const std::string& filename, uint64_t* number,
|
|||
// when
|
||||
IOStatus SetCurrentFile(const WriteOptions& write_options, FileSystem* fs,
|
||||
const std::string& dbname, uint64_t descriptor_number,
|
||||
Temperature temp,
|
||||
FSDirectory* dir_contains_current_file);
|
||||
|
||||
// Make the IDENTITY file for the db
|
||||
Status SetIdentityFile(const WriteOptions& write_options, Env* env,
|
||||
const std::string& dbname,
|
||||
const std::string& dbname, Temperature temp,
|
||||
const std::string& db_id = {});
|
||||
|
||||
// Sync manifest file `file`.
|
||||
|
|
|
@ -421,10 +421,28 @@ Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path,
|
|||
return delete_scheduler_.DeleteFile(file_path, path_to_sync, force_bg);
|
||||
}
|
||||
|
||||
// Hands an unaccounted file to the DeleteScheduler for deletion, forwarding
// `force_bg` and the optional trash bucket unchanged. The file path is exposed
// to tests through a sync point callback before the deletion is scheduled.
Status SstFileManagerImpl::ScheduleUnaccountedFileDeletion(
    const std::string& file_path, const std::string& dir_to_sync,
    const bool force_bg, std::optional<int32_t> bucket) {
  TEST_SYNC_POINT_CALLBACK(
      "SstFileManagerImpl::ScheduleUnaccountedFileDeletion",
      const_cast<std::string*>(&file_path));
  Status result = delete_scheduler_.DeleteUnaccountedFile(
      file_path, dir_to_sync, force_bg, bucket);
  return result;
}
|
||||
|
||||
// Forwards to DeleteScheduler::WaitForEmptyTrash() on the owned scheduler.
void SstFileManagerImpl::WaitForEmptyTrash() {
|
||||
delete_scheduler_.WaitForEmptyTrash();
|
||||
}
|
||||
|
||||
// Forwards to DeleteScheduler::NewTrashBucket() and returns its result, which
// may be std::nullopt when no bucket was created.
std::optional<int32_t> SstFileManagerImpl::NewTrashBucket() {
|
||||
return delete_scheduler_.NewTrashBucket();
|
||||
}
|
||||
|
||||
// Forwards to DeleteScheduler::WaitForEmptyTrashBucket() for `bucket`.
void SstFileManagerImpl::WaitForEmptyTrashBucket(int32_t bucket) {
|
||||
delete_scheduler_.WaitForEmptyTrashBucket(bucket);
|
||||
}
|
||||
|
||||
void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path,
|
||||
uint64_t file_size) {
|
||||
auto tracked_file = tracked_files_.find(file_path);
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
#include "db/compaction/compaction.h"
|
||||
|
@ -118,17 +118,40 @@ class SstFileManagerImpl : public SstFileManager {
|
|||
// not guaranteed
|
||||
bool CancelErrorRecovery(ErrorHandler* db);
|
||||
|
||||
// Mark file as trash and schedule it's deletion. If force_bg is set, it
|
||||
// Mark a file as trash and schedule its deletion. If force_bg is set, it
|
||||
// forces the file to be deleted in the background regardless of DB size,
|
||||
// except when rate limited delete is disabled
|
||||
// except when rate limited delete is disabled.
|
||||
virtual Status ScheduleFileDeletion(const std::string& file_path,
|
||||
const std::string& dir_to_sync,
|
||||
const bool force_bg = false);
|
||||
|
||||
// Wait for all files being deleteing in the background to finish or for
|
||||
// Delete an unaccounted file. The file is deleted immediately if slow
|
||||
// deletion is disabled. A file with more than 1 hard link will be deleted
|
||||
// immediately unless force_bg is set. In other cases, files will be scheduled
|
||||
// for slow deletion, and assigned to the specified bucket if a legitimate one
|
||||
// is provided. A legitimate bucket is one that is created with the
|
||||
// `NewTrashBucket` API, and for which `WaitForEmptyTrashBucket` hasn't been
|
||||
// called yet.
|
||||
virtual Status ScheduleUnaccountedFileDeletion(
|
||||
const std::string& file_path, const std::string& dir_to_sync,
|
||||
const bool force_bg = false,
|
||||
std::optional<int32_t> bucket = std::nullopt);
|
||||
|
||||
// Wait for all files being deleted in the background to finish or for
|
||||
// destructor to be called.
|
||||
virtual void WaitForEmptyTrash();
|
||||
|
||||
// Creates a new trash bucket. A legitimate bucket is only created and
|
||||
// returned when slow deletion is enabled.
|
||||
// For each bucket that is created and used, the user should also call
|
||||
// `WaitForEmptyTrashBucket` after scheduling file deletions to make sure all
|
||||
// the trash files are cleared.
|
||||
std::optional<int32_t> NewTrashBucket();
|
||||
|
||||
// Wait for all the files in the specified bucket to be deleted in the
|
||||
// background or for destructor to be called.
|
||||
virtual void WaitForEmptyTrashBucket(int32_t bucket);
|
||||
|
||||
DeleteScheduler* delete_scheduler() { return &delete_scheduler_; }
|
||||
|
||||
// Stop the error recovery background thread. This should be called only
|
||||
|
|
|
@ -61,18 +61,6 @@ enum CompactionPri : char {
|
|||
kRoundRobin = 0x4,
|
||||
};
|
||||
|
||||
// Temperature of a file. Used to pass to FileSystem for a different
|
||||
// placement and/or coding.
|
||||
// Reserve some numbers in the middle, in case we need to insert new tier
|
||||
// there.
|
||||
enum class Temperature : uint8_t {
|
||||
kUnknown = 0,
|
||||
kHot = 0x04,
|
||||
kWarm = 0x08,
|
||||
kCold = 0x0C,
|
||||
kLastTemperature,
|
||||
};
|
||||
|
||||
struct FileTemperatureAge {
|
||||
Temperature temperature = Temperature::kUnknown;
|
||||
uint64_t age = 0;
|
||||
|
@ -813,7 +801,7 @@ struct AdvancedColumnFamilyOptions {
|
|||
// If this option is set, when creating the last level files, pass this
|
||||
// temperature to FileSystem used. Should be no-op for default FileSystem
|
||||
// and users need to plug in their own FileSystem to take advantage of it.
|
||||
// When using FIFO compaction, this option is ignored.
|
||||
// Currently only compatible with universal compaction.
|
||||
//
|
||||
// Dynamically changeable through the SetOptions() API
|
||||
Temperature last_level_temperature = Temperature::kUnknown;
|
||||
|
@ -1090,6 +1078,13 @@ struct AdvancedColumnFamilyOptions {
|
|||
// Dynamically changeable through the SetOptions() API.
|
||||
uint32_t bottommost_file_compaction_delay = 0;
|
||||
|
||||
// Enables additional integrity checks during reads/scans.
|
||||
// Specifically, for skiplist-based memtables, we verify that keys visited
|
||||
// are in order. This is helpful to detect corrupted memtable keys during
|
||||
// reads. Enabling this feature incurs a performance overhead due to an
|
||||
// additional key comparison during memtable lookup.
|
||||
bool paranoid_memory_checks = false;
|
||||
|
||||
// Create ColumnFamilyOptions with default values for all fields
|
||||
AdvancedColumnFamilyOptions();
|
||||
// Create ColumnFamilyOptions from Options
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#include "rocksdb/port_defs.h"
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/thread_status.h"
|
||||
#include "rocksdb/types.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
// Windows API macro interference
|
||||
|
@ -159,6 +160,9 @@ class Env : public Customizable {
|
|||
|
||||
// Size of file in bytes
|
||||
uint64_t size_bytes;
|
||||
|
||||
// EXPERIMENTAL - only provided by some implementations
|
||||
Temperature temperature = Temperature::kUnknown;
|
||||
};
|
||||
|
||||
Env();
|
||||
|
|
|
@ -195,7 +195,9 @@ struct FileOptions : EnvOptions {
|
|||
FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {}
|
||||
|
||||
FileOptions(const DBOptions& opts)
|
||||
: EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
|
||||
: EnvOptions(opts),
|
||||
temperature(opts.metadata_write_temperature),
|
||||
handoff_checksum_type(ChecksumType::kCRC32c) {}
|
||||
|
||||
FileOptions(const EnvOptions& opts)
|
||||
: EnvOptions(opts), handoff_checksum_type(ChecksumType::kCRC32c) {}
|
||||
|
@ -1952,7 +1954,8 @@ class FSDirectoryWrapper : public FSDirectory {
|
|||
// A utility routine: write "data" to the named file.
|
||||
IOStatus WriteStringToFile(FileSystem* fs, const Slice& data,
|
||||
const std::string& fname, bool should_sync = false,
|
||||
const IOOptions& io_options = IOOptions());
|
||||
const IOOptions& io_options = IOOptions(),
|
||||
const FileOptions& file_options = FileOptions());
|
||||
|
||||
// A utility routine: read contents of named file into *data
|
||||
IOStatus ReadFileToString(FileSystem* fs, const std::string& fname,
|
||||
|
|
|
@ -47,7 +47,8 @@ class FilterBitsReader;
|
|||
// structs because this is expected to be a temporary, stack-allocated object.
|
||||
struct FilterBuildingContext {
|
||||
// This constructor is for internal use only and subject to change.
|
||||
FilterBuildingContext(const BlockBasedTableOptions& table_options);
|
||||
// Keeps a reference to table_options.
|
||||
explicit FilterBuildingContext(const BlockBasedTableOptions& table_options);
|
||||
|
||||
// Options for the table being built
|
||||
const BlockBasedTableOptions& table_options;
|
||||
|
|
|
@ -194,6 +194,15 @@ class MemTableRep {
|
|||
virtual void Get(const LookupKey& k, void* callback_args,
|
||||
bool (*callback_func)(void* arg, const char* entry));
|
||||
|
||||
// Same as Get() but performs data integrity validation.
|
||||
virtual Status GetAndValidate(const LookupKey& /* k */,
|
||||
void* /* callback_args */,
|
||||
bool (* /* callback_func */)(void* arg,
|
||||
const char* entry),
|
||||
bool /*allow_data_in_error*/) {
|
||||
return Status::NotSupported("GetAndValidate() not implemented.");
|
||||
}
|
||||
|
||||
virtual uint64_t ApproximateNumEntries(const Slice& /*start_ikey*/,
|
||||
const Slice& /*end_key*/) {
|
||||
return 0;
|
||||
|
@ -235,13 +244,38 @@ class MemTableRep {
|
|||
// REQUIRES: Valid()
|
||||
virtual void Next() = 0;
|
||||
|
||||
// Advances to the next position and performs integrity validations on the
|
||||
// skip list. Iterator becomes invalid and Corruption is returned if a
|
||||
// corruption is found.
|
||||
// REQUIRES: Valid()
|
||||
virtual Status NextAndValidate(bool /* allow_data_in_errors */) {
|
||||
return Status::NotSupported("NextAndValidate() not implemented.");
|
||||
}
|
||||
|
||||
// Advances to the previous position.
|
||||
// REQUIRES: Valid()
|
||||
virtual void Prev() = 0;
|
||||
|
||||
// Advances to the previous position and performs integrity validations on
|
||||
// the skip list. Iterator becomes invalid and Corruption is returned if a
|
||||
// corruption is found.
|
||||
// REQUIRES: Valid()
|
||||
virtual Status PrevAndValidate(bool /* allow_data_in_errors */) {
|
||||
return Status::NotSupported("PrevAndValidate() not implemented.");
|
||||
}
|
||||
|
||||
// Advance to the first entry with a key >= target
|
||||
virtual void Seek(const Slice& internal_key, const char* memtable_key) = 0;
|
||||
|
||||
// Seek and perform integrity validations on the skip list.
|
||||
// Iterator becomes invalid and Corruption is returned if a
|
||||
// corruption is found.
|
||||
virtual Status SeekAndValidate(const Slice& /* internal_key */,
|
||||
const char* /* memtable_key */,
|
||||
bool /* allow_data_in_errors */) {
|
||||
return Status::NotSupported("SeekAndValidate() not implemented.");
|
||||
}
|
||||
|
||||
// Retreat to the last entry with a key <= target
|
||||
virtual void SeekForPrev(const Slice& internal_key,
|
||||
const char* memtable_key) = 0;
|
||||
|
|
|
@ -512,6 +512,10 @@ class CompactionService : public Customizable {
|
|||
return CompactionServiceJobStatus::kUseLocal;
|
||||
}
|
||||
|
||||
// Optional callback function upon Installation.
|
||||
virtual void OnInstallation(const std::string& /*scheduled_job_id*/,
|
||||
CompactionServiceJobStatus /*status*/) {}
|
||||
|
||||
// Deprecated. Please implement Schedule() and Wait() API to handle remote
|
||||
// compaction
|
||||
|
||||
|
@ -1434,7 +1438,17 @@ struct DBOptions {
|
|||
// For example, if an SST or blob file referenced by the MANIFEST is missing,
|
||||
// BER might be able to find a set of files corresponding to an old "point in
|
||||
// time" version of the column family, possibly from an older MANIFEST
|
||||
// file. Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are
|
||||
// file.
|
||||
// Besides complete "point in time" version, an incomplete version with
|
||||
// only a suffix of L0 files missing can also be recovered to if the
|
||||
// versioning history doesn't include an atomic flush. From the users'
|
||||
// perspective, missing a suffix of L0 files means missing the
|
||||
// user's most recently written data. So the remaining available files still
|
||||
// present a valid point in time view, although for some previous time. It's
|
||||
// not done for atomic flush because that guarantees a consistent view across
|
||||
// column families. We cannot guarantee that if recovering an incomplete
|
||||
// version.
|
||||
// Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are
|
||||
// either ignored or replaced with BER, or quietly fixed regardless of BER
|
||||
// setting. BER does require at least one valid MANIFEST to recover to a
|
||||
// non-trivial DB state, unlike `ldb repair`.
|
||||
|
@ -1566,6 +1580,16 @@ struct DBOptions {
|
|||
// Default 100ms
|
||||
uint64_t follower_catchup_retry_wait_ms = 100;
|
||||
|
||||
// When DB files other than SST, blob and WAL files are created, use this
|
||||
// filesystem temperature. (See also `wal_write_temperature` and various
|
||||
// `*_temperature` CF options.) When not `kUnknown`, this overrides any
|
||||
// temperature set by OptimizeForManifestWrite functions.
|
||||
Temperature metadata_write_temperature = Temperature::kUnknown;
|
||||
|
||||
// Use this filesystem temperature when creating WAL files. When not
|
||||
// `kUnknown`, this overrides any temperature set by OptimizeForLogWrite
|
||||
// functions.
|
||||
Temperature wal_write_temperature = Temperature::kUnknown;
|
||||
// End EXPERIMENTAL
|
||||
};
|
||||
|
||||
|
@ -2107,6 +2131,8 @@ struct CompactRangeOptions {
|
|||
// IngestExternalFileOptions is used by IngestExternalFile()
|
||||
struct IngestExternalFileOptions {
|
||||
// Can be set to true to move the files instead of copying them.
|
||||
// Note that original file links will be removed after successful ingestion,
|
||||
// unless `allow_db_generated_files` is true.
|
||||
bool move_files = false;
|
||||
// If set to true, ingestion falls back to copy when move fails.
|
||||
bool failed_move_fall_back_to_copy = true;
|
||||
|
@ -2180,22 +2206,19 @@ struct IngestExternalFileOptions {
|
|||
// XXX: "bottommost" is obsolete/confusing terminology to refer to last level
|
||||
bool fail_if_not_bottommost_level = false;
|
||||
// EXPERIMENTAL
|
||||
// If set to true, ingestion will
|
||||
// - allow the files to not be generated by SstFileWriter, and
|
||||
// - ignore cf_id mismatch between cf_id in the files and the CF they are
|
||||
// being ingested into.
|
||||
//
|
||||
// REQUIRES:
|
||||
// - files to be ingested do not overlap with existing keys.
|
||||
// - write_global_seqno = false
|
||||
// - move_files = false
|
||||
//
|
||||
// Warning: This ONLY works for SST files where all keys have sequence number
|
||||
// zero and with no duplicated user keys (this should be guaranteed if the
|
||||
// file is generated by a DB with zero as the largest sequence number).
|
||||
// We scan the entire SST files to validate sequence numbers.
|
||||
// Warning: If a DB contains ingested files generated by another DB/CF,
|
||||
// RepairDB() may not correctly recover these files. It may lose these files.
|
||||
// Enables ingestion of files not generated by SstFileWriter. When true:
|
||||
// - Allows files to be ingested when their cf_id doesn't match the CF they
|
||||
// are being ingested into.
|
||||
// - Preserves original file links after successful ingestion when
|
||||
// `move_files = true`.
|
||||
// REQUIREMENTS:
|
||||
// - Ingested files must not overlap with existing keys.
|
||||
// - `write_global_seqno` must be false.
|
||||
// - All keys in ingested files should have sequence number 0. We fail
|
||||
// ingestion if any sequence number is non-zero.
|
||||
// WARNING: If a DB contains ingested files generated by another DB/CF,
|
||||
// RepairDB() may not recover these files correctly, potentially leading to
|
||||
// data loss.
|
||||
bool allow_db_generated_files = false;
|
||||
};
|
||||
|
||||
|
|
|
@ -529,6 +529,11 @@ enum Tickers : uint32_t {
|
|||
// Footer corruption detected when opening an SST file for reading
|
||||
SST_FOOTER_CORRUPTION_COUNT,
|
||||
|
||||
// Counters for file read retries with the verify_and_reconstruct_read
|
||||
// file system option after detecting a checksum mismatch
|
||||
FILE_READ_CORRUPTION_RETRY_COUNT,
|
||||
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
|
||||
|
||||
TICKER_ENUM_MAX
|
||||
};
|
||||
|
||||
|
|
|
@ -291,15 +291,11 @@ struct BlockBasedTableOptions {
|
|||
// Same as block_restart_interval but used for the index block.
|
||||
int index_block_restart_interval = 1;
|
||||
|
||||
// Block size for partitioned metadata. Currently applied to indexes when
|
||||
// kTwoLevelIndexSearch is used and to filters when partition_filters is used.
|
||||
// Note: Since in the current implementation the filters and index partitions
|
||||
// are aligned, an index/filter block is created when either index or filter
|
||||
// block size reaches the specified limit.
|
||||
// Note: this limit is currently applied to only index blocks; a filter
|
||||
// partition is cut right after an index block is cut
|
||||
// TODO(myabandeh): remove the note above when filter partitions are cut
|
||||
// separately
|
||||
// Target block size for partitioned metadata. Currently applied to indexes
|
||||
// when kTwoLevelIndexSearch is used and to filters when partition_filters is
|
||||
// used. When decouple_partitioned_filters=false (original behavior), there is
|
||||
// much more deviation from this target size. See the comment on
|
||||
// decouple_partitioned_filters.
|
||||
uint64_t metadata_block_size = 4096;
|
||||
|
||||
// `cache_usage_options` allows users to specify the default
|
||||
|
@ -398,6 +394,23 @@ struct BlockBasedTableOptions {
|
|||
// block cache even when cache_index_and_filter_blocks=false.
|
||||
bool partition_filters = false;
|
||||
|
||||
// When both partitioned indexes and partitioned filters are enabled,
|
||||
// this enables independent partitioning boundaries between the two. Most
|
||||
// notably, this enables these metadata blocks to hit their target size much
|
||||
// more accurately, as there is often a disparity between index sizes and
|
||||
// filter sizes. This should reduce fragmentation and metadata overheads in
|
||||
// the block cache, as well as treat blocks more fairly for cache eviction
|
||||
// purposes.
|
||||
//
|
||||
// There are no SST format compatibility issues with this option. (All
|
||||
// versions of RocksDB able to read partitioned filters are able to read
|
||||
// decoupled partitioned filters.)
|
||||
//
|
||||
// decouple_partitioned_filters = false is the original behavior, because of
|
||||
// limitations in the initial implementation, and the new behavior
|
||||
// decouple_partitioned_filters = true is expected to become the new default.
|
||||
bool decouple_partitioned_filters = false;
|
||||
|
||||
// Option to generate Bloom/Ribbon filters that minimize memory
|
||||
// internal fragmentation.
|
||||
//
|
||||
|
@ -679,6 +692,11 @@ struct BlockBasedTablePropertyNames {
|
|||
static const std::string kWholeKeyFiltering;
|
||||
// value is "1" for true and "0" for false.
|
||||
static const std::string kPrefixFiltering;
|
||||
// Set to "1" when partitioned filters are decoupled from partitioned indexes.
|
||||
// This metadata is recorded in case a read-time optimization for coupled
|
||||
// filter+index partitioning is ever developed; that optimization/assumption
|
||||
// would be disabled when this is set.
|
||||
static const std::string kDecoupledPartitionedFilters;
|
||||
};
|
||||
|
||||
// Create default block based table factory.
|
||||
|
|
|
@ -74,6 +74,7 @@ struct TablePropertiesNames {
|
|||
static const std::string kSequenceNumberTimeMapping;
|
||||
static const std::string kTailStartOffset;
|
||||
static const std::string kUserDefinedTimestampsPersisted;
|
||||
static const std::string kKeyLargestSeqno;
|
||||
};
|
||||
|
||||
// `TablePropertiesCollector` provides the mechanism for users to collect
|
||||
|
@ -125,6 +126,8 @@ class TablePropertiesCollector {
|
|||
// Finish() will be called when a table has already been built and is ready
|
||||
// for writing the properties block.
|
||||
// It will be called only once by RocksDB internal.
|
||||
// When the returned Status is not OK, the collected properties will not be
|
||||
// written to the file's property block.
|
||||
//
|
||||
// @params properties User will add their collected statistics to
|
||||
// `properties`.
|
||||
|
@ -132,6 +135,7 @@ class TablePropertiesCollector {
|
|||
|
||||
// Return the human-readable properties, where the key is property name and
|
||||
// the value is the human-readable form of value.
|
||||
// Returned properties are used for logging.
|
||||
// It will only be called after Finish() has been called by RocksDB internal.
|
||||
virtual UserCollectedProperties GetReadableProperties() const = 0;
|
||||
|
||||
|
@ -290,6 +294,12 @@ struct TableProperties {
|
|||
// it's explicitly written to meta properties block.
|
||||
uint64_t user_defined_timestamps_persisted = 1;
|
||||
|
||||
// The largest sequence number of keys in this file.
|
||||
// UINT64_MAX means unknown.
|
||||
// Only written to properties block if known (should be known unless the
|
||||
// table is empty).
|
||||
uint64_t key_largest_seqno = UINT64_MAX;
|
||||
|
||||
// DB identity
|
||||
// db_id is an identifier generated the first time the DB is created
|
||||
// If DB identity is unset or unassigned, `db_id` will be an empty string.
|
||||
|
|
|
@ -110,4 +110,16 @@ enum class WriteStallCondition {
|
|||
kNormal,
|
||||
};
|
||||
|
||||
// Temperature of a file. Used to pass to FileSystem for a different
|
||||
// placement and/or coding.
|
||||
// Reserve some numbers in the middle, in case we need to insert a new tier
|
||||
// there.
|
||||
enum class Temperature : uint8_t {
|
||||
kUnknown = 0,
|
||||
kHot = 0x04,
|
||||
kWarm = 0x08,
|
||||
kCold = 0x0C,
|
||||
kLastTemperature,
|
||||
};
|
||||
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -323,6 +323,22 @@ struct TransactionOptions {
|
|||
// description. If a negative value is specified, then the default value from
|
||||
// TransactionDBOptions is used.
|
||||
int64_t write_batch_flush_threshold = -1;
|
||||
|
||||
// DO NOT USE.
|
||||
// This is only a temporary option dedicated for MyRocks that will soon be
|
||||
// removed.
|
||||
// In normal use cases, meta info like column family's timestamp size is
|
||||
// tracked at the transaction layer, so it's not necessary and even
|
||||
// detrimental to track such info inside the internal WriteBatch because it
|
||||
// may let anti-patterns like bypassing Transaction write APIs and directly
|
||||
// write to its internal `WriteBatch` retrieved like this:
|
||||
// https://github.com/facebook/mysql-5.6/blob/fb-mysql-8.0.32/storage/rocksdb/ha_rocksdb.cc#L4949-L4950
|
||||
// Setting this option to true keeps the aforementioned use case working
|
||||
// until it's refactored out.
|
||||
// When this flag is enabled, we also intentionally only track the timestamp
|
||||
// size in APIs that MyRocks is currently using, including Put, Merge, Delete,
|
||||
// DeleteRange, SingleDelete.
|
||||
bool write_batch_track_timestamp_size = false;
|
||||
};
|
||||
|
||||
// The per-write optimizations that do not involve transactions. TransactionDB
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
// NOTE: in 'main' development branch, this should be the *next*
|
||||
// minor or major version number planned for release.
|
||||
#define ROCKSDB_MAJOR 9
|
||||
#define ROCKSDB_MINOR 6
|
||||
#define ROCKSDB_MINOR 7
|
||||
#define ROCKSDB_PATCH 0
|
||||
|
||||
// Do not use these. We made the mistake of declaring macros starting with
|
||||
|
|
|
@ -30,7 +30,7 @@
|
|||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "rocksdb/status.h"
|
||||
#include "rocksdb/write_batch_base.h"
|
||||
|
@ -437,6 +437,30 @@ class WriteBatch : public WriteBatchBase {
|
|||
Status UpdateTimestamps(const Slice& ts,
|
||||
std::function<size_t(uint32_t /*cf*/)> ts_sz_func);
|
||||
|
||||
// TODO: remove these internal APIs after MyRocks refactor to not directly
|
||||
// write to a `WriteBatch` retrieved from `Transaction` via
|
||||
// `Transaction::GetWriteBatch`.
|
||||
|
||||
void SetTrackTimestampSize(bool track_timestamp_size) {
|
||||
track_timestamp_size_ = track_timestamp_size;
|
||||
}
|
||||
|
||||
inline void MaybeTrackTimestampSize(uint32_t column_family_id, size_t ts_sz) {
|
||||
if (!track_timestamp_size_) {
|
||||
return;
|
||||
}
|
||||
auto iter = cf_id_to_ts_sz_.find(column_family_id);
|
||||
if (iter == cf_id_to_ts_sz_.end()) {
|
||||
cf_id_to_ts_sz_.emplace(column_family_id, ts_sz);
|
||||
}
|
||||
}
|
||||
|
||||
// Return a mapping from column family id to timestamp size of all the column
|
||||
// families involved in this WriteBatch.
|
||||
const std::unordered_map<uint32_t, size_t>& GetColumnFamilyToTimestampSize() {
|
||||
return cf_id_to_ts_sz_;
|
||||
}
|
||||
|
||||
// Verify the per-key-value checksums of this write batch.
|
||||
// Corruption status will be returned if the verification fails.
|
||||
// If this write batch does not have per-key-value checksum,
|
||||
|
@ -511,6 +535,10 @@ class WriteBatch : public WriteBatchBase {
|
|||
|
||||
size_t default_cf_ts_sz_ = 0;
|
||||
|
||||
bool track_timestamp_size_ = false;
|
||||
|
||||
std::unordered_map<uint32_t, size_t> cf_id_to_ts_sz_;
|
||||
|
||||
protected:
|
||||
std::string rep_; // See comment in write_batch.cc for the format of rep_
|
||||
};
|
||||
|
|
|
@ -5317,6 +5317,10 @@ class TickerTypeJni {
|
|||
return -0x53;
|
||||
case ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT:
|
||||
return -0x55;
|
||||
case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT:
|
||||
return -0x56;
|
||||
case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT:
|
||||
return -0x57;
|
||||
case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
|
||||
// -0x54 is the max value at this time. Since these values are exposed
|
||||
// directly to Java clients, we'll keep the value the same till the next
|
||||
|
@ -5774,6 +5778,11 @@ class TickerTypeJni {
|
|||
return ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS;
|
||||
case -0x55:
|
||||
return ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT;
|
||||
case -0x56:
|
||||
return ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT;
|
||||
case -0x57:
|
||||
return ROCKSDB_NAMESPACE::Tickers::
|
||||
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT;
|
||||
case -0x54:
|
||||
// -0x54 is the max value at this time. Since these values are exposed
|
||||
// directly to Java clients, we'll keep the value the same till the next
|
||||
|
|
|
@ -878,6 +878,10 @@ public enum TickerType {
|
|||
|
||||
SST_FOOTER_CORRUPTION_COUNT((byte) -0x55),
|
||||
|
||||
FILE_READ_CORRUPTION_RETRY_COUNT((byte) -0x56),
|
||||
|
||||
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT((byte) -0x57),
|
||||
|
||||
TICKER_ENUM_MAX((byte) -0x54);
|
||||
|
||||
private final byte value;
|
||||
|
|
|
@ -52,6 +52,7 @@
|
|||
#include "port/likely.h"
|
||||
#include "port/port.h"
|
||||
#include "rocksdb/slice.h"
|
||||
#include "test_util/sync_point.h"
|
||||
#include "util/coding.h"
|
||||
#include "util/random.h"
|
||||
|
||||
|
@ -169,13 +170,20 @@ class InlineSkipList {
|
|||
// REQUIRES: Valid()
|
||||
void Next();
|
||||
|
||||
[[nodiscard]] Status NextAndValidate(bool allow_data_in_errors);
|
||||
|
||||
// Advances to the previous position.
|
||||
// REQUIRES: Valid()
|
||||
void Prev();
|
||||
|
||||
[[nodiscard]] Status PrevAndValidate(bool allow_data_in_errors);
|
||||
|
||||
// Advance to the first entry with a key >= target
|
||||
void Seek(const char* target);
|
||||
|
||||
[[nodiscard]] Status SeekAndValidate(const char* target,
|
||||
bool allow_data_in_errors);
|
||||
|
||||
// Retreat to the last entry with a key <= target
|
||||
void SeekForPrev(const char* target);
|
||||
|
||||
|
@ -237,21 +245,20 @@ class InlineSkipList {
|
|||
bool KeyIsAfterNode(const DecodedKey& key, Node* n) const;
|
||||
|
||||
// Returns the earliest node with a key >= key.
|
||||
// Return nullptr if there is no such node.
|
||||
Node* FindGreaterOrEqual(const char* key) const;
|
||||
// Returns nullptr if there is no such node.
|
||||
// @param out_of_order_node If not null, will validate the order of visited
|
||||
// nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
|
||||
// returned and *out_of_order_node will be set to n2.
|
||||
Node* FindGreaterOrEqual(const char* key, Node** out_of_order_node) const;
|
||||
|
||||
// Return the latest node with a key < key.
|
||||
// Return head_ if there is no such node.
|
||||
// Returns the latest node with a key < key.
|
||||
// Returns head_ if there is no such node.
|
||||
// Fills prev[level] with pointer to previous node at "level" for every
|
||||
// level in [0..max_height_-1], if prev is non-null.
|
||||
Node* FindLessThan(const char* key, Node** prev = nullptr) const;
|
||||
|
||||
// Return the latest node with a key < key on bottom_level. Start searching
|
||||
// from root node on the level below top_level.
|
||||
// Fills prev[level] with pointer to previous node at "level" for every
|
||||
// level in [bottom_level..top_level-1], if prev is non-null.
|
||||
Node* FindLessThan(const char* key, Node** prev, Node* root, int top_level,
|
||||
int bottom_level) const;
|
||||
// @param out_of_order_node If not null, will validate the order of visited
|
||||
// nodes. If a pair of out-of-order nodes n1 and n2 are found, n1 will be
|
||||
// returned and *out_of_order_node will be set to n2.
|
||||
Node* FindLessThan(const char* key, Node** out_of_order_node) const;
|
||||
|
||||
// Return the last node in the list.
|
||||
// Return head_ if list is empty.
|
||||
|
@ -274,6 +281,8 @@ class InlineSkipList {
|
|||
// lowest_level (inclusive).
|
||||
void RecomputeSpliceLevels(const DecodedKey& key, Splice* splice,
|
||||
int recompute_level);
|
||||
|
||||
static Status Corruption(Node* prev, Node* next, bool allow_data_in_errors);
|
||||
};
|
||||
|
||||
// Implementation details follow
|
||||
|
@ -392,20 +401,68 @@ inline void InlineSkipList<Comparator>::Iterator::Next() {
|
|||
node_ = node_->Next(0);
|
||||
}
|
||||
|
||||
template <class Comparator>
|
||||
inline Status InlineSkipList<Comparator>::Iterator::NextAndValidate(
|
||||
bool allow_data_in_errors) {
|
||||
assert(Valid());
|
||||
Node* prev_node = node_;
|
||||
node_ = node_->Next(0);
|
||||
// Verify that keys are increasing.
|
||||
if (prev_node != list_->head_ && node_ != nullptr &&
|
||||
list_->compare_(prev_node->Key(), node_->Key()) >= 0) {
|
||||
Node* node = node_;
|
||||
// invalidates the iterator
|
||||
node_ = nullptr;
|
||||
return Corruption(prev_node, node, allow_data_in_errors);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class Comparator>
|
||||
inline void InlineSkipList<Comparator>::Iterator::Prev() {
|
||||
// Instead of using explicit "prev" links, we just search for the
|
||||
// last node that falls before key.
|
||||
assert(Valid());
|
||||
node_ = list_->FindLessThan(node_->Key());
|
||||
node_ = list_->FindLessThan(node_->Key(), nullptr);
|
||||
if (node_ == list_->head_) {
|
||||
node_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
template <class Comparator>
|
||||
inline Status InlineSkipList<Comparator>::Iterator::PrevAndValidate(
|
||||
const bool allow_data_in_errors) {
|
||||
assert(Valid());
|
||||
// Skip list validation is done in FindLessThan().
|
||||
Node* out_of_order_node = nullptr;
|
||||
node_ = list_->FindLessThan(node_->Key(), &out_of_order_node);
|
||||
if (out_of_order_node) {
|
||||
Node* node = node_;
|
||||
node_ = nullptr;
|
||||
return Corruption(node, out_of_order_node, allow_data_in_errors);
|
||||
}
|
||||
if (node_ == list_->head_) {
|
||||
node_ = nullptr;
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class Comparator>
|
||||
inline void InlineSkipList<Comparator>::Iterator::Seek(const char* target) {
|
||||
node_ = list_->FindGreaterOrEqual(target);
|
||||
node_ = list_->FindGreaterOrEqual(target, nullptr);
|
||||
}
|
||||
|
||||
template <class Comparator>
|
||||
inline Status InlineSkipList<Comparator>::Iterator::SeekAndValidate(
|
||||
const char* target, const bool allow_data_in_errors) {
|
||||
Node* out_of_order_node = nullptr;
|
||||
node_ = list_->FindGreaterOrEqual(target, &out_of_order_node);
|
||||
if (out_of_order_node) {
|
||||
Node* node = node_;
|
||||
node_ = nullptr;
|
||||
return Corruption(node, out_of_order_node, allow_data_in_errors);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class Comparator>
|
||||
|
@ -448,6 +505,7 @@ int InlineSkipList<Comparator>::RandomHeight() {
|
|||
rnd->Next() < kScaledInverseBranching_) {
|
||||
height++;
|
||||
}
|
||||
TEST_SYNC_POINT_CALLBACK("InlineSkipList::RandomHeight::height", &height);
|
||||
assert(height > 0);
|
||||
assert(height <= kMaxHeight_);
|
||||
assert(height <= kMaxPossibleHeight);
|
||||
|
@ -472,7 +530,8 @@ bool InlineSkipList<Comparator>::KeyIsAfterNode(const DecodedKey& key,
|
|||
|
||||
template <class Comparator>
|
||||
typename InlineSkipList<Comparator>::Node*
|
||||
InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
|
||||
InlineSkipList<Comparator>::FindGreaterOrEqual(
|
||||
const char* key, Node** const out_of_order_node) const {
|
||||
// Note: It looks like we could reduce duplication by implementing
|
||||
// this function as FindLessThan(key)->Next(0), but we wouldn't be able
|
||||
// to exit early on equality and the result wouldn't even be correct.
|
||||
|
@ -486,6 +545,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
|
|||
Node* next = x->Next(level);
|
||||
if (next != nullptr) {
|
||||
PREFETCH(next->Next(level), 0, 1);
|
||||
if (out_of_order_node && x != head_ &&
|
||||
compare_(x->Key(), next->Key()) >= 0) {
|
||||
*out_of_order_node = next;
|
||||
return x;
|
||||
}
|
||||
}
|
||||
// Make sure the lists are sorted
|
||||
assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
|
||||
|
@ -509,18 +573,11 @@ InlineSkipList<Comparator>::FindGreaterOrEqual(const char* key) const {
|
|||
|
||||
template <class Comparator>
|
||||
typename InlineSkipList<Comparator>::Node*
|
||||
InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev) const {
|
||||
return FindLessThan(key, prev, head_, GetMaxHeight(), 0);
|
||||
}
|
||||
|
||||
template <class Comparator>
|
||||
typename InlineSkipList<Comparator>::Node*
|
||||
InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
|
||||
Node* root, int top_level,
|
||||
int bottom_level) const {
|
||||
assert(top_level > bottom_level);
|
||||
int level = top_level - 1;
|
||||
Node* x = root;
|
||||
InlineSkipList<Comparator>::FindLessThan(const char* key,
|
||||
Node** const out_of_order_node) const {
|
||||
int level = GetMaxHeight() - 1;
|
||||
assert(level >= 0);
|
||||
Node* x = head_;
|
||||
// KeyIsAfter(key, last_not_after) is definitely false
|
||||
Node* last_not_after = nullptr;
|
||||
const DecodedKey key_decoded = compare_.decode_key(key);
|
||||
|
@ -529,6 +586,11 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
|
|||
Node* next = x->Next(level);
|
||||
if (next != nullptr) {
|
||||
PREFETCH(next->Next(level), 0, 1);
|
||||
if (out_of_order_node && x != head_ &&
|
||||
compare_(x->Key(), next->Key()) >= 0) {
|
||||
*out_of_order_node = next;
|
||||
return x;
|
||||
}
|
||||
}
|
||||
assert(x == head_ || next == nullptr || KeyIsAfterNode(next->Key(), x));
|
||||
assert(x == head_ || KeyIsAfterNode(key_decoded, x));
|
||||
|
@ -537,10 +599,7 @@ InlineSkipList<Comparator>::FindLessThan(const char* key, Node** prev,
|
|||
assert(next != nullptr);
|
||||
x = next;
|
||||
} else {
|
||||
if (prev != nullptr) {
|
||||
prev[level] = x;
|
||||
}
|
||||
if (level == bottom_level) {
|
||||
if (level == 0) {
|
||||
return x;
|
||||
} else {
|
||||
// Switch to next list, reuse KeyIsAfterNode() result
|
||||
|
@ -910,12 +969,12 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
|
|||
while (true) {
|
||||
// Checking for duplicate keys on the level 0 is sufficient
|
||||
if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
|
||||
compare_(x->Key(), splice->next_[i]->Key()) >= 0)) {
|
||||
compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
|
||||
// duplicate key
|
||||
return false;
|
||||
}
|
||||
if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
|
||||
compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) {
|
||||
compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
|
||||
// duplicate key
|
||||
return false;
|
||||
}
|
||||
|
@ -953,12 +1012,12 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
|
|||
}
|
||||
// Checking for duplicate keys on the level 0 is sufficient
|
||||
if (UNLIKELY(i == 0 && splice->next_[i] != nullptr &&
|
||||
compare_(x->Key(), splice->next_[i]->Key()) >= 0)) {
|
||||
compare_(splice->next_[i]->Key(), key_decoded) <= 0)) {
|
||||
// duplicate key
|
||||
return false;
|
||||
}
|
||||
if (UNLIKELY(i == 0 && splice->prev_[i] != head_ &&
|
||||
compare_(splice->prev_[i]->Key(), x->Key()) >= 0)) {
|
||||
compare_(splice->prev_[i]->Key(), key_decoded) >= 0)) {
|
||||
// duplicate key
|
||||
return false;
|
||||
}
|
||||
|
@ -999,7 +1058,7 @@ bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
|
|||
|
||||
template <class Comparator>
|
||||
bool InlineSkipList<Comparator>::Contains(const char* key) const {
|
||||
Node* x = FindGreaterOrEqual(key);
|
||||
Node* x = FindGreaterOrEqual(key, nullptr);
|
||||
if (x != nullptr && Equal(key, x->Key())) {
|
||||
return true;
|
||||
} else {
|
||||
|
@ -1048,4 +1107,14 @@ void InlineSkipList<Comparator>::TEST_Validate() const {
|
|||
}
|
||||
}
|
||||
|
||||
template <class Comparator>
|
||||
Status InlineSkipList<Comparator>::Corruption(Node* prev, Node* next,
|
||||
bool allow_data_in_errors) {
|
||||
std::string msg = "Out-of-order keys found in skiplist.";
|
||||
if (allow_data_in_errors) {
|
||||
msg.append(" prev key: " + Slice(prev->Key()).ToString(true));
|
||||
msg.append(" next key: " + Slice(next->Key()).ToString(true));
|
||||
}
|
||||
return Status::Corruption(msg);
|
||||
}
|
||||
} // namespace ROCKSDB_NAMESPACE
|
||||
|
|
|
@ -92,6 +92,20 @@ class SkipListRep : public MemTableRep {
|
|||
}
|
||||
}
|
||||
|
||||
Status GetAndValidate(const LookupKey& k, void* callback_args,
|
||||
bool (*callback_func)(void* arg, const char* entry),
|
||||
bool allow_data_in_errors) override {
|
||||
SkipListRep::Iterator iter(&skip_list_);
|
||||
Slice dummy_slice;
|
||||
Status status = iter.SeekAndValidate(dummy_slice, k.memtable_key().data(),
|
||||
allow_data_in_errors);
|
||||
for (; iter.Valid() && status.ok() &&
|
||||
callback_func(callback_args, iter.key());
|
||||
status = iter.NextAndValidate(allow_data_in_errors)) {
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
uint64_t ApproximateNumEntries(const Slice& start_ikey,
|
||||
const Slice& end_ikey) override {
|
||||
std::string tmp;
|
||||
|
@ -181,15 +195,24 @@ class SkipListRep : public MemTableRep {
|
|||
|
||||
// Returns the key at the current position.
|
||||
// REQUIRES: Valid()
|
||||
const char* key() const override { return iter_.key(); }
|
||||
const char* key() const override {
|
||||
assert(Valid());
|
||||
return iter_.key();
|
||||
}
|
||||
|
||||
// Advances to the next position.
|
||||
// REQUIRES: Valid()
|
||||
void Next() override { iter_.Next(); }
|
||||
void Next() override {
|
||||
assert(Valid());
|
||||
iter_.Next();
|
||||
}
|
||||
|
||||
// Advances to the previous position.
|
||||
// REQUIRES: Valid()
|
||||
void Prev() override { iter_.Prev(); }
|
||||
void Prev() override {
|
||||
assert(Valid());
|
||||
iter_.Prev();
|
||||
}
|
||||
|
||||
// Advance to the first entry with a key >= target
|
||||
void Seek(const Slice& user_key, const char* memtable_key) override {
|
||||
|
@ -219,6 +242,26 @@ class SkipListRep : public MemTableRep {
|
|||
// Final state of iterator is Valid() iff list is not empty.
|
||||
void SeekToLast() override { iter_.SeekToLast(); }
|
||||
|
||||
Status NextAndValidate(bool allow_data_in_errors) override {
|
||||
assert(Valid());
|
||||
return iter_.NextAndValidate(allow_data_in_errors);
|
||||
}
|
||||
|
||||
// Seeks the underlying iterator and returns the Status it reports.
// If `memtable_key` is non-null it is used as the seek target directly and
// `user_key` is ignored; otherwise `user_key` is encoded into the scratch
// buffer `tmp_` via EncodeKey() and the encoded key is used instead.
Status SeekAndValidate(const Slice& user_key, const char* memtable_key,
|
||||
bool allow_data_in_errors) override {
|
||||
if (memtable_key != nullptr) {
|
||||
return iter_.SeekAndValidate(memtable_key, allow_data_in_errors);
|
||||
} else {
|
||||
return iter_.SeekAndValidate(EncodeKey(&tmp_, user_key),
|
||||
allow_data_in_errors);
|
||||
}
|
||||
}
|
||||
|
||||
// Moves to the previous position and returns the Status reported by the
// underlying iterator's PrevAndValidate().
// REQUIRES: Valid() (asserted; the check is compiled out under NDEBUG)
Status PrevAndValidate(bool allow_data_in_error) override {
|
||||
assert(Valid());
|
||||
return iter_.PrevAndValidate(allow_data_in_error);
|
||||
}
|
||||
|
||||
protected:
|
||||
std::string tmp_; // For passing to EncodeKey
|
||||
};
|
||||
|
|
|
@ -266,6 +266,10 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
|||
{PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"},
|
||||
{PREFETCH_HITS, "rocksdb.prefetch.hits"},
|
||||
{SST_FOOTER_CORRUPTION_COUNT, "rocksdb.footer.corruption.count"},
|
||||
{FILE_READ_CORRUPTION_RETRY_COUNT,
|
||||
"rocksdb.file.read.corruption.retry.count"},
|
||||
{FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
|
||||
"rocksdb.file.read.corruption.retry.success.count"},
|
||||
};
|
||||
|
||||
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
|
||||
|
|
|
@ -531,6 +531,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
|
|||
{offsetof(struct MutableCFOptions, block_protection_bytes_per_key),
|
||||
OptionType::kUInt8T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kMutable}},
|
||||
{"paranoid_memory_checks",
|
||||
{offsetof(struct MutableCFOptions, paranoid_memory_checks),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kMutable}},
|
||||
{kOptNameCompOpts,
|
||||
OptionTypeInfo::Struct(
|
||||
kOptNameCompOpts, &compression_options_type_info,
|
||||
|
@ -1104,6 +1108,8 @@ void MutableCFOptions::Dump(Logger* log) const {
|
|||
ttl);
|
||||
ROCKS_LOG_INFO(log, " periodic_compaction_seconds: %" PRIu64,
|
||||
periodic_compaction_seconds);
|
||||
ROCKS_LOG_INFO(log, " paranoid_memory_checks: %d",
|
||||
paranoid_memory_checks);
|
||||
std::string result;
|
||||
char buf[10];
|
||||
for (const auto m : max_bytes_for_level_multiplier_additional) {
|
||||
|
|
|
@ -168,6 +168,7 @@ struct MutableCFOptions {
|
|||
memtable_protection_bytes_per_key(
|
||||
options.memtable_protection_bytes_per_key),
|
||||
block_protection_bytes_per_key(options.block_protection_bytes_per_key),
|
||||
paranoid_memory_checks(options.paranoid_memory_checks),
|
||||
sample_for_compression(
|
||||
options.sample_for_compression), // TODO: is 0 fine here?
|
||||
compression_per_level(options.compression_per_level),
|
||||
|
@ -317,6 +318,7 @@ struct MutableCFOptions {
|
|||
Temperature default_write_temperature;
|
||||
uint32_t memtable_protection_bytes_per_key;
|
||||
uint8_t block_protection_bytes_per_key;
|
||||
bool paranoid_memory_checks;
|
||||
|
||||
uint64_t sample_for_compression;
|
||||
std::vector<CompressionType> compression_per_level;
|
||||
|
|
|
@ -576,6 +576,14 @@ static std::unordered_map<std::string, OptionTypeInfo>
|
|||
{offsetof(struct ImmutableDBOptions, follower_catchup_retry_wait_ms),
|
||||
OptionType::kUInt64T, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"metadata_write_temperature",
|
||||
{offsetof(struct ImmutableDBOptions, metadata_write_temperature),
|
||||
OptionType::kTemperature, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"wal_write_temperature",
|
||||
{offsetof(struct ImmutableDBOptions, wal_write_temperature),
|
||||
OptionType::kTemperature, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
};
|
||||
|
||||
const std::string OptionsHelper::kDBOptionsName = "DBOptions";
|
||||
|
@ -778,7 +786,9 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
|
|||
follower_refresh_catchup_period_ms(
|
||||
options.follower_refresh_catchup_period_ms),
|
||||
follower_catchup_retry_count(options.follower_catchup_retry_count),
|
||||
follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms) {
|
||||
follower_catchup_retry_wait_ms(options.follower_catchup_retry_wait_ms),
|
||||
metadata_write_temperature(options.metadata_write_temperature),
|
||||
wal_write_temperature(options.wal_write_temperature) {
|
||||
fs = env->GetFileSystem();
|
||||
clock = env->GetSystemClock().get();
|
||||
logger = info_log.get();
|
||||
|
@ -956,6 +966,10 @@ void ImmutableDBOptions::Dump(Logger* log) const {
|
|||
db_host_id.c_str());
|
||||
ROCKS_LOG_HEADER(log, " Options.enforce_single_del_contracts: %s",
|
||||
enforce_single_del_contracts ? "true" : "false");
|
||||
ROCKS_LOG_HEADER(log, " Options.metadata_write_temperature: %s",
|
||||
temperature_to_string[metadata_write_temperature].c_str());
|
||||
ROCKS_LOG_HEADER(log, " Options.wal_write_temperature: %s",
|
||||
temperature_to_string[wal_write_temperature].c_str());
|
||||
}
|
||||
|
||||
bool ImmutableDBOptions::IsWalDirSameAsDBPath() const {
|
||||
|
|
|
@ -103,6 +103,8 @@ struct ImmutableDBOptions {
|
|||
uint64_t follower_refresh_catchup_period_ms;
|
||||
uint64_t follower_catchup_retry_count;
|
||||
uint64_t follower_catchup_retry_wait_ms;
|
||||
Temperature metadata_write_temperature;
|
||||
Temperature wal_write_temperature;
|
||||
|
||||
// Beginning convenience/helper objects that are not part of the base
|
||||
// DBOptions
|
||||
|
|
|
@ -180,6 +180,15 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
|
|||
options.enforce_single_del_contracts =
|
||||
immutable_db_options.enforce_single_del_contracts;
|
||||
options.daily_offpeak_time_utc = mutable_db_options.daily_offpeak_time_utc;
|
||||
options.follower_refresh_catchup_period_ms =
|
||||
immutable_db_options.follower_refresh_catchup_period_ms;
|
||||
options.follower_catchup_retry_count =
|
||||
immutable_db_options.follower_catchup_retry_count;
|
||||
options.follower_catchup_retry_wait_ms =
|
||||
immutable_db_options.follower_catchup_retry_wait_ms;
|
||||
options.metadata_write_temperature =
|
||||
immutable_db_options.metadata_write_temperature;
|
||||
options.wal_write_temperature = immutable_db_options.wal_write_temperature;
|
||||
return options;
|
||||
}
|
||||
|
||||
|
@ -213,6 +222,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions,
|
|||
moptions.memtable_protection_bytes_per_key;
|
||||
cf_opts->block_protection_bytes_per_key =
|
||||
moptions.block_protection_bytes_per_key;
|
||||
cf_opts->paranoid_memory_checks = moptions.paranoid_memory_checks;
|
||||
cf_opts->bottommost_file_compaction_delay =
|
||||
moptions.bottommost_file_compaction_delay;
|
||||
|
||||
|
|
|
@ -69,8 +69,9 @@ Status PersistRocksDBOptions(const WriteOptions& write_options,
|
|||
}
|
||||
std::unique_ptr<FSWritableFile> wf;
|
||||
|
||||
Status s =
|
||||
fs->NewWritableFile(file_name, FileOptions(), &wf, nullptr);
|
||||
FileOptions file_options;
|
||||
file_options.temperature = db_opt.metadata_write_temperature;
|
||||
Status s = fs->NewWritableFile(file_name, file_options, &wf, nullptr);
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
|
|
|
@ -188,6 +188,7 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
|
|||
"block_size_deviation=8;block_restart_interval=4; "
|
||||
"metadata_block_size=1024;"
|
||||
"partition_filters=false;"
|
||||
"decouple_partitioned_filters=true;"
|
||||
"optimize_filters_for_memory=true;"
|
||||
"use_delta_encoding=true;"
|
||||
"index_block_restart_interval=4;"
|
||||
|
@ -366,7 +367,12 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
|
|||
"lowest_used_cache_tier=kNonVolatileBlockTier;"
|
||||
"allow_data_in_errors=false;"
|
||||
"enforce_single_del_contracts=false;"
|
||||
"daily_offpeak_time_utc=08:30-19:00;",
|
||||
"daily_offpeak_time_utc=08:30-19:00;"
|
||||
"follower_refresh_catchup_period_ms=123;"
|
||||
"follower_catchup_retry_count=456;"
|
||||
"follower_catchup_retry_wait_ms=789;"
|
||||
"metadata_write_temperature=kCold;"
|
||||
"wal_write_temperature=kHot;",
|
||||
new_options));
|
||||
|
||||
ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),
|
||||
|
@ -567,7 +573,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
|
|||
"block_protection_bytes_per_key=1;"
|
||||
"memtable_max_range_deletions=999999;"
|
||||
"bottommost_file_compaction_delay=7200;"
|
||||
"uncache_aggressiveness=1234;",
|
||||
"uncache_aggressiveness=1234;"
|
||||
"paranoid_memory_checks=1;",
|
||||
new_options));
|
||||
|
||||
ASSERT_NE(new_options->blob_cache.get(), nullptr);
|
||||
|
|
|
@ -96,7 +96,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder(
|
|||
mopt.prefix_extractor.get(), table_opt.whole_key_filtering,
|
||||
filter_bits_builder, table_opt.index_block_restart_interval,
|
||||
use_delta_encoding_for_index_values, p_index_builder, partition_size,
|
||||
ts_sz, persist_user_defined_timestamps);
|
||||
ts_sz, persist_user_defined_timestamps,
|
||||
table_opt.decouple_partitioned_filters);
|
||||
} else {
|
||||
return new FullFilterBlockBuilder(mopt.prefix_extractor.get(),
|
||||
table_opt.whole_key_filtering,
|
||||
|
@ -213,10 +214,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
|
|||
public:
|
||||
explicit BlockBasedTablePropertiesCollector(
|
||||
BlockBasedTableOptions::IndexType index_type, bool whole_key_filtering,
|
||||
bool prefix_filtering)
|
||||
bool prefix_filtering, bool decoupled_partitioned_filters)
|
||||
: index_type_(index_type),
|
||||
whole_key_filtering_(whole_key_filtering),
|
||||
prefix_filtering_(prefix_filtering) {}
|
||||
prefix_filtering_(prefix_filtering),
|
||||
decoupled_partitioned_filters_(decoupled_partitioned_filters) {}
|
||||
|
||||
Status InternalAdd(const Slice& /*key*/, const Slice& /*value*/,
|
||||
uint64_t /*file_size*/) override {
|
||||
|
@ -240,6 +242,11 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
|
|||
whole_key_filtering_ ? kPropTrue : kPropFalse});
|
||||
properties->insert({BlockBasedTablePropertyNames::kPrefixFiltering,
|
||||
prefix_filtering_ ? kPropTrue : kPropFalse});
|
||||
if (decoupled_partitioned_filters_) {
|
||||
properties->insert(
|
||||
{BlockBasedTablePropertyNames::kDecoupledPartitionedFilters,
|
||||
kPropTrue});
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -257,6 +264,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector
|
|||
BlockBasedTableOptions::IndexType index_type_;
|
||||
bool whole_key_filtering_;
|
||||
bool prefix_filtering_;
|
||||
bool decoupled_partitioned_filters_;
|
||||
};
|
||||
|
||||
struct BlockBasedTableBuilder::Rep {
|
||||
|
@ -296,7 +304,7 @@ struct BlockBasedTableBuilder::Rep {
|
|||
std::string index_separator_scratch;
|
||||
PartitionedIndexBuilder* p_index_builder_ = nullptr;
|
||||
|
||||
std::string last_key;
|
||||
std::string last_ikey; // Internal key or empty (unset)
|
||||
const Slice* first_key_in_next_block = nullptr;
|
||||
CompressionType compression_type;
|
||||
uint64_t sample_for_compression;
|
||||
|
@ -594,7 +602,8 @@ struct BlockBasedTableBuilder::Rep {
|
|||
table_properties_collectors.emplace_back(
|
||||
new BlockBasedTablePropertiesCollector(
|
||||
table_options.index_type, table_options.whole_key_filtering,
|
||||
prefix_extractor != nullptr));
|
||||
prefix_extractor != nullptr,
|
||||
table_options.decouple_partitioned_filters));
|
||||
if (ts_sz > 0 && persist_user_defined_timestamps) {
|
||||
table_properties_collectors.emplace_back(
|
||||
new TimestampTablePropertiesCollector(
|
||||
|
@ -618,6 +627,9 @@ struct BlockBasedTableBuilder::Rep {
|
|||
if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) {
|
||||
ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set");
|
||||
}
|
||||
// Default is UINT64_MAX for unknown. Setting it to 0 here
|
||||
// to allow updating it by taking max in BlockBasedTableBuilder::Add().
|
||||
props.key_largest_seqno = 0;
|
||||
|
||||
if (FormatVersionUsesContextChecksum(table_options.format_version)) {
|
||||
// Must be non-zero and semi- or quasi-random
|
||||
|
@ -654,6 +666,7 @@ struct BlockBasedTableBuilder::Rep {
|
|||
};
|
||||
|
||||
struct BlockBasedTableBuilder::ParallelCompressionRep {
|
||||
// TODO: consider replacing with autovector or similar
|
||||
// Keys is a wrapper of vector of strings avoiding
|
||||
// releasing string memories during vector clear()
|
||||
// in order to save memory allocation overhead
|
||||
|
@ -998,24 +1011,27 @@ BlockBasedTableBuilder::~BlockBasedTableBuilder() {
|
|||
delete rep_;
|
||||
}
|
||||
|
||||
void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
||||
void BlockBasedTableBuilder::Add(const Slice& ikey, const Slice& value) {
|
||||
Rep* r = rep_;
|
||||
assert(rep_->state != Rep::State::kClosed);
|
||||
if (!ok()) {
|
||||
return;
|
||||
}
|
||||
ValueType value_type = ExtractValueType(key);
|
||||
ValueType value_type;
|
||||
SequenceNumber seq;
|
||||
UnPackSequenceAndType(ExtractInternalKeyFooter(ikey), &seq, &value_type);
|
||||
r->props.key_largest_seqno = std::max(r->props.key_largest_seqno, seq);
|
||||
if (IsValueType(value_type)) {
|
||||
#ifndef NDEBUG
|
||||
if (r->props.num_entries > r->props.num_range_deletions) {
|
||||
assert(r->internal_comparator.Compare(key, Slice(r->last_key)) > 0);
|
||||
assert(r->internal_comparator.Compare(ikey, Slice(r->last_ikey)) > 0);
|
||||
}
|
||||
#endif // !NDEBUG
|
||||
|
||||
auto should_flush = r->flush_block_policy->Update(key, value);
|
||||
auto should_flush = r->flush_block_policy->Update(ikey, value);
|
||||
if (should_flush) {
|
||||
assert(!r->data_block.empty());
|
||||
r->first_key_in_next_block = &key;
|
||||
r->first_key_in_next_block = &ikey;
|
||||
Flush();
|
||||
if (r->state == Rep::State::kBuffered) {
|
||||
bool exceeds_buffer_limit =
|
||||
|
@ -1050,7 +1066,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
|||
if (r->IsParallelCompressionEnabled()) {
|
||||
r->pc_rep->curr_block_keys->Clear();
|
||||
} else {
|
||||
r->index_builder->AddIndexEntry(r->last_key, &key, r->pending_handle,
|
||||
r->index_builder->AddIndexEntry(r->last_ikey, &ikey,
|
||||
r->pending_handle,
|
||||
&r->index_separator_scratch);
|
||||
}
|
||||
}
|
||||
|
@ -1060,27 +1077,31 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
|||
// builder after being added to index builder.
|
||||
if (r->state == Rep::State::kUnbuffered) {
|
||||
if (r->IsParallelCompressionEnabled()) {
|
||||
r->pc_rep->curr_block_keys->PushBack(key);
|
||||
r->pc_rep->curr_block_keys->PushBack(ikey);
|
||||
} else {
|
||||
if (r->filter_builder != nullptr) {
|
||||
r->filter_builder->Add(
|
||||
ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
|
||||
r->filter_builder->AddWithPrevKey(
|
||||
ExtractUserKeyAndStripTimestamp(ikey, r->ts_sz),
|
||||
r->last_ikey.empty()
|
||||
? Slice{}
|
||||
: ExtractUserKeyAndStripTimestamp(r->last_ikey, r->ts_sz));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
r->data_block.AddWithLastKey(key, value, r->last_key);
|
||||
r->last_key.assign(key.data(), key.size());
|
||||
r->data_block.AddWithLastKey(ikey, value, r->last_ikey);
|
||||
r->last_ikey.assign(ikey.data(), ikey.size());
|
||||
assert(!r->last_ikey.empty());
|
||||
if (r->state == Rep::State::kBuffered) {
|
||||
// Buffered keys will be replayed from data_block_buffers during
|
||||
// `Finish()` once compression dictionary has been finalized.
|
||||
} else {
|
||||
if (!r->IsParallelCompressionEnabled()) {
|
||||
r->index_builder->OnKeyAdded(key);
|
||||
r->index_builder->OnKeyAdded(ikey);
|
||||
}
|
||||
}
|
||||
// TODO offset passed in is not accurate for parallel compression case
|
||||
NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
|
||||
NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
|
||||
r->table_properties_collectors,
|
||||
r->ioptions.logger);
|
||||
|
||||
|
@ -1094,9 +1115,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
|||
if (r->ts_sz > 0 && !r->persist_user_defined_timestamps) {
|
||||
persisted_end = StripTimestampFromUserKey(value, r->ts_sz);
|
||||
}
|
||||
r->range_del_block.Add(key, persisted_end);
|
||||
r->range_del_block.Add(ikey, persisted_end);
|
||||
// TODO offset passed in is not accurate for parallel compression case
|
||||
NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(),
|
||||
NotifyCollectTableCollectorsOnAdd(ikey, value, r->get_offset(),
|
||||
r->table_properties_collectors,
|
||||
r->ioptions.logger);
|
||||
} else {
|
||||
|
@ -1108,7 +1129,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
|
|||
}
|
||||
|
||||
r->props.num_entries++;
|
||||
r->props.raw_key_size += key.size();
|
||||
r->props.raw_key_size += ikey.size();
|
||||
if (!r->persist_user_defined_timestamps) {
|
||||
r->props.raw_key_size -= r->ts_sz;
|
||||
}
|
||||
|
@ -1452,6 +1473,8 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
|
|||
Rep* r = rep_;
|
||||
ParallelCompressionRep::BlockRepSlot* slot = nullptr;
|
||||
ParallelCompressionRep::BlockRep* block_rep = nullptr;
|
||||
// Starts empty; see FilterBlockBuilder::AddWithPrevKey
|
||||
std::string prev_block_last_key_no_ts;
|
||||
while (r->pc_rep->write_queue.pop(slot)) {
|
||||
assert(slot != nullptr);
|
||||
slot->Take(block_rep);
|
||||
|
@ -1465,13 +1488,20 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() {
|
|||
continue;
|
||||
}
|
||||
|
||||
Slice prev_key_no_ts = prev_block_last_key_no_ts;
|
||||
for (size_t i = 0; i < block_rep->keys->Size(); i++) {
|
||||
auto& key = (*block_rep->keys)[i];
|
||||
if (r->filter_builder != nullptr) {
|
||||
r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
|
||||
Slice key_no_ts = ExtractUserKeyAndStripTimestamp(key, r->ts_sz);
|
||||
r->filter_builder->AddWithPrevKey(key_no_ts, prev_key_no_ts);
|
||||
prev_key_no_ts = key_no_ts;
|
||||
}
|
||||
r->index_builder->OnKeyAdded(key);
|
||||
}
|
||||
if (r->filter_builder != nullptr) {
|
||||
prev_block_last_key_no_ts.assign(prev_key_no_ts.data(),
|
||||
prev_key_no_ts.size());
|
||||
}
|
||||
|
||||
r->pc_rep->file_size_estimator.SetCurrBlockUncompSize(
|
||||
block_rep->data->size());
|
||||
|
@ -1563,6 +1593,13 @@ void BlockBasedTableBuilder::WriteFilterBlock(
|
|||
// No filter block needed
|
||||
return;
|
||||
}
|
||||
if (!rep_->last_ikey.empty()) {
|
||||
// We might have been using AddWithPrevKey, so need PrevKeyBeforeFinish
|
||||
// to be safe. And because we are re-synchronized after buffered/parallel
|
||||
// operation, rep_->last_ikey is accurate.
|
||||
rep_->filter_builder->PrevKeyBeforeFinish(
|
||||
ExtractUserKeyAndStripTimestamp(rep_->last_ikey, rep_->ts_sz));
|
||||
}
|
||||
BlockHandle filter_block_handle;
|
||||
bool is_partitioned_filter = rep_->table_options.partition_filters;
|
||||
if (ok()) {
|
||||
|
@ -1578,9 +1615,10 @@ void BlockBasedTableBuilder::WriteFilterBlock(
|
|||
// See FilterBlockBuilder::Finish() for more on the difference in
|
||||
// transferred filter data payload among different FilterBlockBuilder
|
||||
// subtypes.
|
||||
std::unique_ptr<const char[]> filter_data;
|
||||
Slice filter_content =
|
||||
rep_->filter_builder->Finish(filter_block_handle, &s, &filter_data);
|
||||
std::unique_ptr<const char[]> filter_owner;
|
||||
Slice filter_content;
|
||||
s = rep_->filter_builder->Finish(filter_block_handle, &filter_content,
|
||||
&filter_owner);
|
||||
|
||||
assert(s.ok() || s.IsIncomplete() || s.IsCorruption());
|
||||
if (s.IsCorruption()) {
|
||||
|
@ -1749,6 +1787,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock(
|
|||
rep_->props.user_defined_timestamps_persisted =
|
||||
rep_->persist_user_defined_timestamps;
|
||||
|
||||
assert(IsEmpty() || rep_->props.key_largest_seqno != UINT64_MAX);
|
||||
// Add basic properties
|
||||
property_block_builder.AddTableProperty(rep_->props);
|
||||
|
||||
|
@ -1976,6 +2015,10 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
|
|||
for (; iter->Valid(); iter->Next()) {
|
||||
Slice key = iter->key();
|
||||
if (r->filter_builder != nullptr) {
|
||||
// NOTE: AddWithPrevKey here would only save key copying if prev is
|
||||
// pinned (iter->IsKeyPinned()), which is probably rare with delta
|
||||
// encoding. OK to go from Add() here to AddWithPrevKey() in
|
||||
// unbuffered operation.
|
||||
r->filter_builder->Add(
|
||||
ExtractUserKeyAndStripTimestamp(key, r->ts_sz));
|
||||
}
|
||||
|
@ -1989,6 +2032,7 @@ void BlockBasedTableBuilder::EnterUnbuffered() {
|
|||
Slice* first_key_in_next_block_ptr = &first_key_in_next_block;
|
||||
|
||||
iter->SeekToLast();
|
||||
assert(iter->Valid());
|
||||
r->index_builder->AddIndexEntry(
|
||||
iter->key(), first_key_in_next_block_ptr, r->pending_handle,
|
||||
&r->index_separator_scratch);
|
||||
|
@ -2027,7 +2071,7 @@ Status BlockBasedTableBuilder::Finish() {
|
|||
// block, we will finish writing all index entries first.
|
||||
if (ok() && !empty_data_block) {
|
||||
r->index_builder->AddIndexEntry(
|
||||
r->last_key, nullptr /* no next data block */, r->pending_handle,
|
||||
r->last_ikey, nullptr /* no next data block */, r->pending_handle,
|
||||
&r->index_separator_scratch);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -304,6 +304,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
|
|||
{offsetof(struct BlockBasedTableOptions, partition_filters),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"decouple_partitioned_filters",
|
||||
{offsetof(struct BlockBasedTableOptions, decouple_partitioned_filters),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal,
|
||||
OptionTypeFlags::kNone}},
|
||||
{"optimize_filters_for_memory",
|
||||
{offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory),
|
||||
OptionType::kBoolean, OptionVerificationType::kNormal,
|
||||
|
@ -971,6 +975,8 @@ const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering =
|
|||
"rocksdb.block.based.table.whole.key.filtering";
|
||||
const std::string BlockBasedTablePropertyNames::kPrefixFiltering =
|
||||
"rocksdb.block.based.table.prefix.filtering";
|
||||
const std::string BlockBasedTablePropertyNames::kDecoupledPartitionedFilters =
|
||||
"rocksdb.block.based.table.decoupled.partitioned.filters";
|
||||
const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes";
|
||||
const std::string kHashIndexPrefixesMetadataBlock =
|
||||
"rocksdb.hashindex.metadata";
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue