mirror of https://github.com/facebook/rocksdb.git
Call PreReleaseCallback between WAL and memtable write (#5015)
Summary: PreReleaseCallback meant to be called before the writes are visible to the readers. Since the sequence number is known after the WAL write, there is no reason to delay calling PreReleaseCallback to after the memtable write, which would complicates the reader's logic in presence of our memtable writes that are made visible by the other write thread. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5015 Differential Revision: D14221670 Pulled By: maysamyabandeh fbshipit-source-id: a504dd665cf923226d7af09cc8e9c7739a25edc6
This commit is contained in:
parent
68a2f94d5d
commit
77ebc82b92
|
@ -146,17 +146,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||
|
||||
if (write_thread_.CompleteParallelMemTableWriter(&w)) {
|
||||
// we're responsible for exit batch group
|
||||
for (auto* writer : *(w.write_group)) {
|
||||
if (!writer->CallbackFailed() && writer->pre_release_callback) {
|
||||
assert(writer->sequence != kMaxSequenceNumber);
|
||||
Status ws = writer->pre_release_callback->Callback(writer->sequence,
|
||||
disable_memtable);
|
||||
if (!ws.ok()) {
|
||||
status = ws;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO(myabandeh): propagate status to write_group
|
||||
auto last_sequence = w.write_group->last_sequence;
|
||||
versions_->SetLastSequence(last_sequence);
|
||||
|
@ -309,6 +298,35 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||
const SequenceNumber current_sequence = last_sequence + 1;
|
||||
last_sequence += seq_inc;
|
||||
|
||||
// PreReleaseCallback is called after WAL write and before memtable write
|
||||
if (status.ok()) {
|
||||
SequenceNumber next_sequence = current_sequence;
|
||||
// Note: the logic for advancing seq here must be consistent with the
|
||||
// logic in WriteBatchInternal::InsertInto(write_group...) as well as
|
||||
// with WriteBatchInternal::InsertInto(write_batch...) that is called on
|
||||
// the merged batch during recovery from the WAL.
|
||||
for (auto* writer : write_group) {
|
||||
if (writer->CallbackFailed()) {
|
||||
continue;
|
||||
}
|
||||
writer->sequence = next_sequence;
|
||||
if (writer->pre_release_callback) {
|
||||
Status ws = writer->pre_release_callback->Callback(writer->sequence,
|
||||
disable_memtable);
|
||||
if (!ws.ok()) {
|
||||
status = ws;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (seq_per_batch_) {
|
||||
assert(writer->batch_cnt);
|
||||
next_sequence += writer->batch_cnt;
|
||||
} else if (writer->ShouldWriteToMemtable()) {
|
||||
next_sequence += WriteBatchInternal::Count(writer->batch);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (status.ok()) {
|
||||
PERF_TIMER_GUARD(write_memtable_time);
|
||||
|
||||
|
@ -320,23 +338,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||
0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
|
||||
batch_per_txn_);
|
||||
} else {
|
||||
SequenceNumber next_sequence = current_sequence;
|
||||
// Note: the logic for advancing seq here must be consistent with the
|
||||
// logic in WriteBatchInternal::InsertInto(write_group...) as well as
|
||||
// with WriteBatchInternal::InsertInto(write_batch...) that is called on
|
||||
// the merged batch during recovery from the WAL.
|
||||
for (auto* writer : write_group) {
|
||||
if (writer->CallbackFailed()) {
|
||||
continue;
|
||||
}
|
||||
writer->sequence = next_sequence;
|
||||
if (seq_per_batch_) {
|
||||
assert(writer->batch_cnt);
|
||||
next_sequence += writer->batch_cnt;
|
||||
} else if (writer->ShouldWriteToMemtable()) {
|
||||
next_sequence += WriteBatchInternal::Count(writer->batch);
|
||||
}
|
||||
}
|
||||
write_group.last_sequence = last_sequence;
|
||||
write_thread_.LaunchParallelMemTableWriters(&write_group);
|
||||
in_parallel_group = true;
|
||||
|
@ -388,17 +389,8 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
|
|||
}
|
||||
if (should_exit_batch_group) {
|
||||
if (status.ok()) {
|
||||
for (auto* writer : write_group) {
|
||||
if (!writer->CallbackFailed() && writer->pre_release_callback) {
|
||||
assert(writer->sequence != kMaxSequenceNumber);
|
||||
Status ws = writer->pre_release_callback->Callback(writer->sequence,
|
||||
disable_memtable);
|
||||
if (!ws.ok()) {
|
||||
status = ws;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Note: if we are to resume after non-OK statuses we need to revisit how
|
||||
// we reacts to non-OK statuses here.
|
||||
versions_->SetLastSequence(last_sequence);
|
||||
}
|
||||
MemTableInsertStatusCheck(w.status);
|
||||
|
|
|
@ -15,8 +15,8 @@ class PreReleaseCallback {
|
|||
public:
|
||||
virtual ~PreReleaseCallback() {}
|
||||
|
||||
// Will be called while on the write thread after the write and before the
|
||||
// release of the sequence number. This is useful if any operation needs to be
|
||||
// Will be called while on the write thread after the write to the WAL and
|
||||
// before the write to memtable. This is useful if any operation needs to be
|
||||
// done before the write gets visible to the readers, or if we want to reduce
|
||||
// the overhead of locking by updating something sequentially while we are on
|
||||
// the write thread. If the callback fails, this function returns a non-OK
|
||||
|
|
Loading…
Reference in New Issue