Fix `needs_flush` assertion in file ingestion (#13045)

Summary:
This PR makes the file ingestion job's flush wait a little longer, until the SuperVersion is also updated. This is necessary because the follow-up steps use the current SuperVersion to do the range-overlap check and level assignment for the files being ingested.

In debug mode, the file ingestion job's second `NeedsFlush` call could run after the memtables had been flushed but before the SuperVersion was updated, so the stale SuperVersion still appeared to need a flush and the assertion was triggered.
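
In other words, the wait loop now treats an ingestion-triggered flush as complete only once the installed SuperVersion has caught up with the memtable list. Below is a minimal sketch of that condition, distilled from the `WaitForFlushMemTables()` hunk in this diff; the structs are simplified stand-ins for RocksDB's internal classes, not the real API:

#include <cstdint>
#include <optional>

// Simplified stand-ins for RocksDB's internal classes (not the real API).
enum class FlushReason { kOthers, kErrorRecovery, kExternalFileIngestion };

struct MemTableListVersion {
  uint64_t id = 0;  // bumped each time a new version is installed
};

struct SuperVersion {
  const MemTableListVersion* imm;  // snapshot taken when the SV was installed
};

// A file-ingestion flush only counts as finished once the SuperVersion's
// snapshot of the immutable memtable list has caught up with the currently
// installed MemTableListVersion; all other flush reasons keep the old
// behavior.
bool IngestionFlushDone(const SuperVersion& sv,
                        const MemTableListVersion& current,
                        std::optional<FlushReason> reason) {
  if (!reason.has_value() ||
      reason.value() != FlushReason::kExternalFileIngestion) {
    return true;
  }
  return sv.imm->id == current.id;
}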

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13045

Test Plan:
Existing tests
Manually stress tested

Reviewed By: cbi42

Differential Revision: D63671151

Pulled By: jowlyzhang

fbshipit-source-id: 95a169e58a7e59f6dd4125e7296e9060fe4c63a7
Commit 9375c3b635 (parent dd23e84cad) by Yu Zhang, 2024-10-02 17:19:18 -07:00, committed by Facebook GitHub Bot.
6 changed files with 58 additions and 16 deletions.

db/column_family.h

@@ -385,9 +385,9 @@ class ColumnFamilyData {
   uint64_t GetTotalSstFilesSize() const;  // REQUIRE: DB mutex held
   uint64_t GetLiveSstFilesSize() const;   // REQUIRE: DB mutex held
   uint64_t GetTotalBlobFileSize() const;  // REQUIRE: DB mutex held
   // REQUIRE: DB mutex held
   void SetMemtable(MemTable* new_mem) {
-    uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
-    new_mem->SetID(memtable_id);
+    new_mem->SetID(++last_memtable_id_);
     mem_ = new_mem;
   }

@@ -669,7 +669,7 @@ class ColumnFamilyData {
   bool allow_2pc_;
   // Memtable id to track flush.
-  std::atomic<uint64_t> last_memtable_id_;
+  uint64_t last_memtable_id_;
   // Directories corresponding to cf_paths.
   std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
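
Note: dropping `std::atomic` from `last_memtable_id_` is safe because `SetMemtable()` runs with the DB mutex held (per the `REQUIRE` comment above), so the increment is already serialized by the mutex.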

db/db_impl/db_impl.h

@@ -2076,17 +2076,18 @@ class DBImpl : public DB {
   // memtable pending flush.
   // resuming_from_bg_err indicates whether the caller is attempting to resume
   // from background error.
-  Status WaitForFlushMemTable(ColumnFamilyData* cfd,
-                              const uint64_t* flush_memtable_id = nullptr,
-                              bool resuming_from_bg_err = false) {
+  Status WaitForFlushMemTable(
+      ColumnFamilyData* cfd, const uint64_t* flush_memtable_id = nullptr,
+      bool resuming_from_bg_err = false,
+      std::optional<FlushReason> flush_reason = std::nullopt) {
     return WaitForFlushMemTables({cfd}, {flush_memtable_id},
-                                 resuming_from_bg_err);
+                                 resuming_from_bg_err, flush_reason);
   }
   // Wait for memtables to be flushed for multiple column families.
   Status WaitForFlushMemTables(
       const autovector<ColumnFamilyData*>& cfds,
       const autovector<const uint64_t*>& flush_memtable_ids,
-      bool resuming_from_bg_err);
+      bool resuming_from_bg_err, std::optional<FlushReason> flush_reason);
   inline void WaitForPendingWrites() {
     mutex_.AssertHeld();

db/db_impl/db_impl_compaction_flush.cc

@@ -2407,7 +2407,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
   }
   s = WaitForFlushMemTables(
       cfds, flush_memtable_ids,
-      flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */);
+      flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */,
+      flush_reason);
   InstrumentedMutexLock lock_guard(&mutex_);
   for (auto* tmp_cfd : cfds) {
     tmp_cfd->UnrefAndTryDelete();

@@ -2549,7 +2550,8 @@ Status DBImpl::AtomicFlushMemTables(
   }
   s = WaitForFlushMemTables(
       cfds, flush_memtable_ids,
-      flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */);
+      flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */,
+      flush_reason);
   InstrumentedMutexLock lock_guard(&mutex_);
   for (auto* cfd : cfds) {
     cfd->UnrefAndTryDelete();

@@ -2612,7 +2614,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason,
     flush_memtable_id_ptrs.push_back(&flush_memtable_id);
   }
   s = WaitForFlushMemTables(cfds, flush_memtable_id_ptrs,
-                            true /* resuming_from_bg_err */);
+                            true /* resuming_from_bg_err */, flush_reason);
   mutex_.Lock();
 }
@@ -2712,7 +2714,7 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
 Status DBImpl::WaitForFlushMemTables(
     const autovector<ColumnFamilyData*>& cfds,
     const autovector<const uint64_t*>& flush_memtable_ids,
-    bool resuming_from_bg_err) {
+    bool resuming_from_bg_err, std::optional<FlushReason> flush_reason) {
   int num = static_cast<int>(cfds.size());
   // Wait until the compaction completes
   InstrumentedMutexLock l(&mutex_);

@@ -2750,9 +2752,17 @@ Status DBImpl::WaitForFlushMemTables(
         (flush_memtable_ids[i] != nullptr &&
          cfds[i]->imm()->GetEarliestMemTableID() >
              *flush_memtable_ids[i])) {
-      ++num_finished;
+      // Make file ingestion's flush wait until SuperVersion is also updated
+      // since after flush, it does range overlapping check and file level
+      // assignment with the current SuperVersion.
+      if (!flush_reason.has_value() ||
+          flush_reason.value() != FlushReason::kExternalFileIngestion ||
+          cfds[i]->GetSuperVersion()->imm->GetID() ==
+              cfds[i]->imm()->current()->GetID()) {
+        ++num_finished;
+      }
     }
   }
   if (1 == num_dropped && 1 == num) {
     s = Status::ColumnFamilyDropped();
     return s;
db/memtable_list.cc

@@ -359,11 +359,15 @@ bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
   }
 }

+bool MemTableListVersion::HistoryShouldBeTrimmed(size_t usage) {
+  return MemtableLimitExceeded(usage) && !memlist_history_.empty();
+}
+
 // Make sure we don't use up too much space in history
 bool MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete,
                                       size_t usage) {
   bool ret = false;
-  while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) {
+  while (HistoryShouldBeTrimmed(usage)) {
     MemTable* x = memlist_history_.back();
     memlist_history_.pop_back();

@@ -661,8 +665,16 @@ void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
 }

 bool MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) {
+  // Check if history trim is needed first, so that we can avoid installing a
+  // new MemTableListVersion without installing a SuperVersion (installed based
+  // on return value of this function).
+  if (!current_->HistoryShouldBeTrimmed(usage)) {
+    ResetTrimHistoryNeeded();
+    return false;
+  }
   InstallNewVersion();
   bool ret = current_->TrimHistory(to_delete, usage);
+  assert(ret);
   UpdateCachedValuesFromMemTableListVersion();
   ResetTrimHistoryNeeded();
   return ret;
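
Note: the early return above matters because callers install a new SuperVersion based on this function's return value (per the new comment). Unconditionally calling `InstallNewVersion()` for a no-op trim would create a MemTableListVersion that no SuperVersion ever picks up, and its bumped ID would then keep the `GetID()` comparison in `WaitForFlushMemTables()` from ever seeing the SuperVersion as caught up.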
@@ -714,6 +726,7 @@ void MemTableList::InstallNewVersion() {
     // somebody else holds the current version, we need to create new one
     MemTableListVersion* version = current_;
     current_ = new MemTableListVersion(&current_memory_usage_, *version);
+    current_->SetID(++last_memtable_list_version_id_);
     current_->Ref();
     version->Unref();
   }

db/memtable_list.h

@@ -141,6 +141,11 @@ class MemTableListVersion {
   // Return kMaxSequenceNumber if the list is empty.
   SequenceNumber GetFirstSequenceNumber() const;

+  // REQUIRES: db_mutex held.
+  void SetID(uint64_t id) { id_ = id; }
+
+  uint64_t GetID() const { return id_; }
+
  private:
   friend class MemTableList;

@@ -161,7 +166,11 @@ class MemTableListVersion {
   // REQUIRE: m is an immutable memtable
   void Remove(MemTable* m, autovector<MemTable*>* to_delete);

-  // Return true if memtable is trimmed
+  // Return true if the memtable list should be trimmed to get memory usage
+  // under budget.
+  bool HistoryShouldBeTrimmed(size_t usage);
+
+  // Trim history, Return true if memtable is trimmed
   bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);

   bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,

@@ -205,6 +214,9 @@ class MemTableListVersion {
   int refs_ = 0;
   size_t* parent_memtable_list_memory_usage_;

+  // MemtableListVersion id to track for flush results checking.
+  uint64_t id_ = 0;
 };

@@ -235,7 +247,8 @@ class MemTableList {
         flush_requested_(false),
         current_memory_usage_(0),
         current_memory_allocted_bytes_excluding_last_(0),
-        current_has_history_(false) {
+        current_has_history_(false),
+        last_memtable_list_version_id_(0) {
     current_->Ref();
   }

@@ -500,6 +513,10 @@ class MemTableList {
   // Cached value of current_->HasHistory().
   std::atomic<bool> current_has_history_;

+  // Last memtabe list version id, increase by 1 each time a new
+  // MemtableListVersion is installed.
+  uint64_t last_memtable_list_version_id_;
 };

 // Installs memtable atomic flush results.

include/rocksdb/options.h

@@ -2037,6 +2037,7 @@ struct FlushOptions {
   // is performed by someone else (foreground call or background thread).
   // Default: false
   bool allow_write_stall;
+
   FlushOptions() : wait(true), allow_write_stall(false) {}
 };