mirror of https://github.com/facebook/rocksdb.git
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary: The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk unlocks the DB mutex before destroying the `ThreadLocalPtr` holding the per-thread `SuperVersion` pointers when the only remaining reference is the back reference from `super_version_`. The idea behind this was to break the circular dependency between `ColumnFamilyData` and `SuperVersion`: when the penultimate reference goes away, `ColumnFamilyData` can clean up the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there is a `SuperVersion` and it is not referenced by anything else.) However, unlocking the mutex throws a wrench in this plan by making it possible for another thread to jump in and take another reference to the `ColumnFamilyData`, keeping the object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like https://github.com/facebook/rocksdb/issues/8440 , https://github.com/facebook/rocksdb/issues/8382 , and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet` destructor we sometimes observe during close in our stress tests. Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call `SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone. https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced the circular dependency and associated cleanup logic described above (in order to enable iterators to keep the `ColumnFamilyData` for dropped column families alive), and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`. Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete` in `SuperVersion::Cleanup`. The patch simply eliminates the unlocking and relocking, which has been unnecessary ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free. This closes the window during which another thread could increase the reference count, and hopefully fixes the issues above. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605 Test Plan: Ran `make check` and stress tests locally. Reviewed By: pdillinger Differential Revision: D30051035 Pulled By: ltamasi fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
This commit is contained in:
parent
8e91bd90d2
commit
3f7e929865
|
@ -457,7 +457,7 @@ void SuperVersion::Cleanup() {
|
||||||
to_delete.push_back(m);
|
to_delete.push_back(m);
|
||||||
}
|
}
|
||||||
current->Unref();
|
current->Unref();
|
||||||
cfd->UnrefAndTryDelete(this);
|
cfd->UnrefAndTryDelete();
|
||||||
}
|
}
|
||||||
|
|
||||||
void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
|
void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
|
||||||
|
@ -475,10 +475,10 @@ void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
void SuperVersionUnrefHandle(void* ptr) {
|
void SuperVersionUnrefHandle(void* ptr) {
|
||||||
// UnrefHandle is called when a thread exists or a ThreadLocalPtr gets
|
// UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
|
||||||
// destroyed. When former happens, the thread shouldn't see kSVInUse.
|
// destroyed. When the former happens, the thread shouldn't see kSVInUse.
|
||||||
// When latter happens, we are in ~ColumnFamilyData(), no get should happen as
|
// When the latter happens, only super_version_ holds a reference
|
||||||
// well.
|
// to ColumnFamilyData, so no further queries are possible.
|
||||||
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
|
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
|
||||||
bool was_last_ref __attribute__((__unused__));
|
bool was_last_ref __attribute__((__unused__));
|
||||||
was_last_ref = sv->Unref();
|
was_last_ref = sv->Unref();
|
||||||
|
@ -668,7 +668,7 @@ ColumnFamilyData::~ColumnFamilyData() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ColumnFamilyData::UnrefAndTryDelete(SuperVersion* sv_under_cleanup) {
|
bool ColumnFamilyData::UnrefAndTryDelete() {
|
||||||
int old_refs = refs_.fetch_sub(1);
|
int old_refs = refs_.fetch_sub(1);
|
||||||
assert(old_refs > 0);
|
assert(old_refs > 0);
|
||||||
|
|
||||||
|
@ -678,22 +678,17 @@ bool ColumnFamilyData::UnrefAndTryDelete(SuperVersion* sv_under_cleanup) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If called under SuperVersion::Cleanup, we should not re-enter Cleanup on
|
if (old_refs == 2 && super_version_ != nullptr) {
|
||||||
// the same SuperVersion. (But while installing a new SuperVersion, this
|
|
||||||
// cfd could be referenced only by two SuperVersions.)
|
|
||||||
if (old_refs == 2 && super_version_ != nullptr &&
|
|
||||||
super_version_ != sv_under_cleanup) {
|
|
||||||
// Only the super_version_ holds me
|
// Only the super_version_ holds me
|
||||||
SuperVersion* sv = super_version_;
|
SuperVersion* sv = super_version_;
|
||||||
super_version_ = nullptr;
|
super_version_ = nullptr;
|
||||||
// Release SuperVersion reference kept in ThreadLocalPtr.
|
|
||||||
// This must be done outside of mutex_ since unref handler can lock mutex.
|
// Release SuperVersion references kept in ThreadLocalPtr.
|
||||||
sv->db_mutex->Unlock();
|
|
||||||
local_sv_.reset();
|
local_sv_.reset();
|
||||||
sv->db_mutex->Lock();
|
|
||||||
|
|
||||||
if (sv->Unref()) {
|
if (sv->Unref()) {
|
||||||
// May delete this ColumnFamilyData after calling Cleanup()
|
// Note: sv will delete this ColumnFamilyData during Cleanup()
|
||||||
|
assert(sv->cfd == this);
|
||||||
sv->Cleanup();
|
sv->Cleanup();
|
||||||
delete sv;
|
delete sv;
|
||||||
return true;
|
return true;
|
||||||
|
@ -1258,14 +1253,13 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
|
||||||
void ColumnFamilyData::InstallSuperVersion(
|
void ColumnFamilyData::InstallSuperVersion(
|
||||||
SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) {
|
SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) {
|
||||||
db_mutex->AssertHeld();
|
db_mutex->AssertHeld();
|
||||||
return InstallSuperVersion(sv_context, db_mutex, mutable_cf_options_);
|
return InstallSuperVersion(sv_context, mutable_cf_options_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ColumnFamilyData::InstallSuperVersion(
|
void ColumnFamilyData::InstallSuperVersion(
|
||||||
SuperVersionContext* sv_context, InstrumentedMutex* db_mutex,
|
SuperVersionContext* sv_context,
|
||||||
const MutableCFOptions& mutable_cf_options) {
|
const MutableCFOptions& mutable_cf_options) {
|
||||||
SuperVersion* new_superversion = sv_context->new_superversion.release();
|
SuperVersion* new_superversion = sv_context->new_superversion.release();
|
||||||
new_superversion->db_mutex = db_mutex;
|
|
||||||
new_superversion->mutable_cf_options = mutable_cf_options;
|
new_superversion->mutable_cf_options = mutable_cf_options;
|
||||||
new_superversion->Init(this, mem_, imm_.current(), current_);
|
new_superversion->Init(this, mem_, imm_.current(), current_);
|
||||||
SuperVersion* old_superversion = super_version_;
|
SuperVersion* old_superversion = super_version_;
|
||||||
|
|
|
@ -208,8 +208,6 @@ struct SuperVersion {
|
||||||
uint64_t version_number;
|
uint64_t version_number;
|
||||||
WriteStallCondition write_stall_condition;
|
WriteStallCondition write_stall_condition;
|
||||||
|
|
||||||
InstrumentedMutex* db_mutex;
|
|
||||||
|
|
||||||
// should be called outside the mutex
|
// should be called outside the mutex
|
||||||
SuperVersion() = default;
|
SuperVersion() = default;
|
||||||
~SuperVersion();
|
~SuperVersion();
|
||||||
|
@ -281,8 +279,7 @@ class ColumnFamilyData {
|
||||||
// UnrefAndTryDelete() decreases the reference count and do free if needed,
|
// UnrefAndTryDelete() decreases the reference count and do free if needed,
|
||||||
// return true if this is freed else false, UnrefAndTryDelete() can only
|
// return true if this is freed else false, UnrefAndTryDelete() can only
|
||||||
// be called while holding a DB mutex, or during single-threaded recovery.
|
// be called while holding a DB mutex, or during single-threaded recovery.
|
||||||
// sv_under_cleanup is only provided when called from SuperVersion::Cleanup.
|
bool UnrefAndTryDelete();
|
||||||
bool UnrefAndTryDelete(SuperVersion* sv_under_cleanup = nullptr);
|
|
||||||
|
|
||||||
// SetDropped() can only be called under following conditions:
|
// SetDropped() can only be called under following conditions:
|
||||||
// 1) Holding a DB mutex,
|
// 1) Holding a DB mutex,
|
||||||
|
@ -453,7 +450,6 @@ class ColumnFamilyData {
|
||||||
// the clients to allocate SuperVersion outside of mutex.
|
// the clients to allocate SuperVersion outside of mutex.
|
||||||
// IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
|
// IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
|
||||||
void InstallSuperVersion(SuperVersionContext* sv_context,
|
void InstallSuperVersion(SuperVersionContext* sv_context,
|
||||||
InstrumentedMutex* db_mutex,
|
|
||||||
const MutableCFOptions& mutable_cf_options);
|
const MutableCFOptions& mutable_cf_options);
|
||||||
void InstallSuperVersion(SuperVersionContext* sv_context,
|
void InstallSuperVersion(SuperVersionContext* sv_context,
|
||||||
InstrumentedMutex* db_mutex);
|
InstrumentedMutex* db_mutex);
|
||||||
|
|
|
@ -3469,7 +3469,7 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
|
||||||
if (UNLIKELY(sv_context->new_superversion == nullptr)) {
|
if (UNLIKELY(sv_context->new_superversion == nullptr)) {
|
||||||
sv_context->NewSuperVersion();
|
sv_context->NewSuperVersion();
|
||||||
}
|
}
|
||||||
cfd->InstallSuperVersion(sv_context, &mutex_, mutable_cf_options);
|
cfd->InstallSuperVersion(sv_context, mutable_cf_options);
|
||||||
|
|
||||||
// There may be a small data race here. The snapshot tricking bottommost
|
// There may be a small data race here. The snapshot tricking bottommost
|
||||||
// compaction may already be released here. But assuming there will always be
|
// compaction may already be released here. But assuming there will always be
|
||||||
|
|
Loading…
Reference in New Issue