From 8c79f79208b82d2ae5be09cb7ae11356884350ac Mon Sep 17 00:00:00 2001
From: DorianZheng
Date: Thu, 3 Jan 2019 12:36:43 -0800
Subject: [PATCH] Fix skipping WAL for the whole write_group when the leader's
 callback fails (#4838)

Summary:
The original implementation has two problems:

1. https://github.com/facebook/rocksdb/blob/f0dda35d7de1fd56e0b7c96376ca8aff2a6364fd/db/db_impl_write.cc#L478
   https://github.com/facebook/rocksdb/blob/f0dda35d7de1fd56e0b7c96376ca8aff2a6364fd/db/write_thread.h#L231

   If the callback status of the write_group's leader is not OK, the whole write_group is not written to the WAL, which may cause data loss.

2. https://github.com/facebook/rocksdb/blob/f0dda35d7de1fd56e0b7c96376ca8aff2a6364fd/db/write_thread.h#L130

   The annotation says that Writer.status is the status of the memtable inserter, but the original implementation also uses it for another case, which is inconsistent with the original design. It looks like we can still reuse Writer.status, but we should update the annotation so that Writer.status is no longer only the status of the memtable inserter.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/4838

Differential Revision: D13574070

Pulled By: yiwu-arbug

fbshipit-source-id: a2a2aefcfd329c4c6a91652bf090aaf1ce119c4b
---
 HISTORY.md          | 1 +
 db/db_impl_write.cc | 4 ++--
 db/write_thread.h   | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 113381d8c0..bc40fac4f5 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -14,6 +14,7 @@
 * Fix a memory leak when files with range tombstones are read in mmap mode and block cache is enabled
 * Fix handling of corrupt range tombstone blocks such that corruptions cannot cause deleted keys to reappear
 * Lock free MultiGet
+* Fix a bug with pipelined writes where the write leader's callback failure caused the whole write group to skip writing to the WAL.
 
 ## 5.18.0 (11/30/2018)
 ### New Features
diff --git a/db/db_impl_write.cc b/db/db_impl_write.cc
index 3a239ee5cd..885832f5a6 100644
--- a/db/db_impl_write.cc
+++ b/db/db_impl_write.cc
@@ -475,7 +475,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
 
     PERF_TIMER_STOP(write_pre_and_post_process_time);
 
-    if (w.ShouldWriteToWAL()) {
+    if (w.status.ok() && !write_options.disableWAL) {
       PERF_TIMER_GUARD(write_wal_time);
       stats->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
       RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
@@ -504,7 +504,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
   WriteThread::WriteGroup memtable_write_group;
   if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
     PERF_TIMER_GUARD(write_memtable_time);
-    assert(w.status.ok());
+    assert(w.ShouldWriteToMemtable());
     write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
     if (memtable_write_group.size > 1 &&
         immutable_db_options_.allow_concurrent_memtable_write) {
diff --git a/db/write_thread.h b/db/write_thread.h
index a3802c996b..dc9c22ff87 100644
--- a/db/write_thread.h
+++ b/db/write_thread.h
@@ -127,7 +127,7 @@ class WriteThread {
     std::atomic<uint8_t> state;  // write under StateMutex() or pre-link
     WriteGroup* write_group;
     SequenceNumber sequence;  // the sequence number to use for the first key
-    Status status;            // status of memtable inserter
+    Status status;
     Status callback_status;   // status returned by callback->Callback()
     std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
     std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
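
To make the failure mode in problem 1 concrete, below is a minimal, self-contained sketch. It is illustrative only: MiniWriter, WalWriteBuggy, and WalWriteFixed are simplified stand-ins invented for this example, not RocksDB types; the comments map each gate back to the conditions changed in the patch above.

    #include <iostream>
    #include <string>
    #include <vector>

    // Simplified stand-in for WriteThread::Writer (illustrative only).
    struct MiniWriter {
      std::string payload;
      bool callback_ok;  // result of this writer's pre-write callback
      bool disable_wal;  // per-write disableWAL option
    };

    // Buggy gate (~ w.ShouldWriteToWAL() on the leader): the leader decides
    // for the whole group from its *own* callback status, so a failing leader
    // callback also drops the followers' payloads from the WAL.
    std::vector<std::string> WalWriteBuggy(const std::vector<MiniWriter>& group) {
      std::vector<std::string> wal;
      const MiniWriter& leader = group[0];
      if (leader.callback_ok && !leader.disable_wal) {
        for (const MiniWriter& w : group) {
          if (w.callback_ok) wal.push_back(w.payload);
        }
      }
      return wal;
    }

    // Fixed gate (~ w.status.ok() && !write_options.disableWAL): the group-level
    // WAL write proceeds whenever the group status is OK and the WAL is enabled;
    // a writer's callback failure only skips that writer's own batch.
    std::vector<std::string> WalWriteFixed(const std::vector<MiniWriter>& group,
                                           bool group_status_ok) {
      std::vector<std::string> wal;
      if (group_status_ok && !group[0].disable_wal) {
        for (const MiniWriter& w : group) {
          if (w.callback_ok) wal.push_back(w.payload);
        }
      }
      return wal;
    }

    int main() {
      // Leader's callback fails; both followers' callbacks succeed.
      std::vector<MiniWriter> group = {{"leader", false, false},
                                       {"follower-1", true, false},
                                       {"follower-2", true, false}};
      std::cout << "buggy WAL entries: " << WalWriteBuggy(group).size() << "\n";
      std::cout << "fixed WAL entries: " << WalWriteFixed(group, true).size() << "\n";
    }

With the leader's callback failing and both followers' callbacks succeeding, the buggy gate writes 0 entries to the WAL while the fixed gate writes 2, which is exactly the data-loss scenario described in problem 1.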