Support read & write with unsynced data in FaultInjectionTestFS (#12852)
Summary: Follow-up to https://github.com/facebook/rocksdb/issues/12729 and others, fixing how FaultInjectionTestFS handles the case where a live WAL is being appended to and synced while also being copied for checkpoint or backup, up to a known flushed (but not necessarily synced) prefix of the file. It was tricky to structure the code to handle a subtle race with Sync in another thread (see code comments; thanks Changyu) while maintaining good performance and testability. For more context, see the call to FlushWAL() in DBImpl::GetLiveFilesStorageInfo(). Also, the unit test for https://github.com/facebook/rocksdb/issues/12729 was neutered by https://github.com/facebook/rocksdb/issues/12797, and this change re-enables the functionality it tests.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12852

Test Plan: Unit test expanded/updated. Local runs of blackbox_crash_test. The implementation is structured so that a multi-threaded unit test is not needed to cover at least the code lines, as the race handling is folded into "catch up after returning unsynced data and then seeing a sync."

Reviewed By: cbi42

Differential Revision: D59594045

Pulled By: pdillinger

fbshipit-source-id: 94667bb72255e2952586c53bae2c2dd384e85a50
Parent: 3db030d7ee
Commit: 72438a6788
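What this enables, as a hedged sketch (not part of the patch; the file name and data literals are made up): a writer appends to a live file and syncs only a prefix, while a backup/checkpoint-style reader can still see and read the full flushed length through FaultInjectionTestFS, even if the writer syncs concurrently.

  std::shared_ptr<FaultInjectionTestFS> fs =
      std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
  fs->SetInjectUnsyncedDataLoss(true);  // also enables ReadUnsyncedData()

  std::unique_ptr<FSWritableFile> wal;
  ASSERT_OK(fs->NewWritableFile("wal.log", {}, &wal, nullptr));
  ASSERT_OK(wal->Append("synced-part", {}, nullptr));
  ASSERT_OK(wal->Sync({}, nullptr));
  ASSERT_OK(wal->Append("unsynced-tail", {}, nullptr));  // flushed, not synced

  // A copier sees the full flushed length, including the unsynced tail...
  uint64_t len = 0;
  ASSERT_OK(fs->GetFileSize("wal.log", {}, &len, nullptr));
  // ...and can read all of it with a sequential reader, as the test below
  // verifies, even if the writer Syncs between reads.
  std::unique_ptr<FSSequentialFile> copier;
  ASSERT_OK(fs->NewSequentialFile("wal.log", {}, &copier, nullptr));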
@@ -567,49 +567,78 @@ class FaultInjectionDBTest : public DBTestBase {

TEST(FaultInjectionFSTest, ReadUnsyncedData) {
  std::shared_ptr<FaultInjectionTestFS> fault_fs =
      std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
  fault_fs->SetInjectUnsyncedDataLoss(true);
  ASSERT_TRUE(fault_fs->ReadUnsyncedData());
  ASSERT_TRUE(fault_fs->InjectUnsyncedDataLoss());

  // This is a randomized mini-stress test, to reduce the chances of bugs in
  // FaultInjectionTestFS being caught only in db_stress, where they are
  // difficult to debug.
  uint32_t len = Random::GetTLSInstance()->Uniform(10000) + 1;
  Random rnd(len);

  std::string f = test::PerThreadDBPath("read_unsynced.data");
  std::string data = rnd.RandomString(len);

  // Create partially synced file
  std::unique_ptr<FSWritableFile> w;
  ASSERT_OK(fault_fs->NewWritableFile(f, {}, &w, nullptr));
  uint32_t synced_len = rnd.Uniform(len + 1);
  ASSERT_OK(w->Append(Slice(data.data(), synced_len), {}, nullptr));
  if (synced_len > 0) {
    ASSERT_OK(w->Sync({}, nullptr));
  }
  ASSERT_OK(w->Append(Slice(data.data() + synced_len, len - synced_len), {},
                      nullptr));

  // Test file size includes unsynced data
  {
    uint64_t file_size;
    ASSERT_OK(fault_fs->GetFileSize(f, {}, &file_size, nullptr));
    ASSERT_EQ(len, file_size);
  }

  // Test read file contents, with two reads that probably don't
  // align with the unsynced split. And maybe a sync or write between
  // the two reads.
  {
    std::unique_ptr<FSSequentialFile> r;
    ASSERT_OK(fault_fs->NewSequentialFile(f, {}, &r, nullptr));
    uint32_t first_read_len = rnd.Uniform(len + 1);
    Slice sl;
    std::unique_ptr<char[]> scratch(new char[first_read_len]);
    ASSERT_OK(r->Read(first_read_len, {}, &sl, scratch.get(), nullptr));
    ASSERT_EQ(first_read_len, sl.size());
    ASSERT_EQ(0, sl.compare(Slice(data.data(), first_read_len)));

    // Maybe a sync and/or write and/or close between the two reads.
    if (rnd.OneIn(2)) {
      ASSERT_OK(w->Sync({}, nullptr));
    }
    if (rnd.OneIn(2)) {
      uint32_t more_len = rnd.Uniform(1000) + 1;
      std::string more_data = rnd.RandomString(more_len);
      ASSERT_OK(w->Append(more_data, {}, nullptr));
      data += more_data;
      len += more_len;
    }
    if (rnd.OneIn(2)) {
      ASSERT_OK(w->Sync({}, nullptr));
    }
    if (rnd.OneIn(2)) {
      ASSERT_OK(w->Close({}, nullptr));
      w.reset();
    }

    // Second read some of, all of, or more than rest of file
    uint32_t second_read_len = rnd.Uniform(len + 1);
    scratch.reset(new char[second_read_len]);
    ASSERT_OK(r->Read(second_read_len, {}, &sl, scratch.get(), nullptr));
    if (len - first_read_len < second_read_len) {
      ASSERT_EQ(len - first_read_len, sl.size());
    } else {
      ASSERT_EQ(second_read_len, sl.size());
    }
    ASSERT_EQ(0, sl.compare(Slice(data.data() + first_read_len, sl.size())));
  }
}

}  // namespace ROCKSDB_NAMESPACE
@@ -559,38 +559,44 @@ size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
  }
}

namespace {
// Modifies `result` to start at the beginning of `scratch` if not already,
// copying data there if needed.
void MoveToScratchIfNeeded(Slice* result, char* scratch) {
  if (result->data() != scratch) {
    // NOTE: might overlap
    std::copy_n(result->data(), result->size(), scratch);
  }
  *result = Slice(scratch, result->size());
}
}  // namespace

void FaultInjectionTestFS::ReadUnsynced(const std::string& fname,
                                        uint64_t offset, size_t n,
                                        Slice* result, char* scratch,
                                        int64_t* pos_at_last_sync) {
  *result = Slice(scratch, 0);  // default empty result

  MutexLock l(&mutex_);
  auto it = db_file_state_.find(fname);
  if (it != db_file_state_.end()) {
    auto& st = it->second;
    // Use 0 to mean "tracked but nothing synced"
    *pos_at_last_sync = std::max(st.pos_at_last_sync_, int64_t{0});
    // Find overlap between [offset, offset + n) and
    // [*pos_at_last_sync, *pos_at_last_sync + st.buffer_.size())
    int64_t begin = std::max(static_cast<int64_t>(offset), *pos_at_last_sync);
    int64_t end =
        std::min(static_cast<int64_t>(offset + n),
                 *pos_at_last_sync + static_cast<int64_t>(st.buffer_.size()));

    // Copy and return overlap if there is any
    if (begin < end) {
      size_t offset_in_buffer = static_cast<size_t>(begin - *pos_at_last_sync);
      size_t offset_in_scratch = static_cast<size_t>(begin - offset);
      std::copy_n(st.buffer_.data() + offset_in_buffer, end - begin,
                  scratch + offset_in_scratch);
      *result = Slice(scratch + offset_in_scratch, end - begin);
    }
  }
}
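As a quick worked example of the interval intersection above (a standalone sketch with made-up values, not part of the patch): if the unsynced buffer covers [100, 150) and the request covers [120, 220), then begin = 120 and end = 150, so 30 unsynced bytes are returned starting at the beginning of scratch.

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    int64_t pos_at_last_sync = 100, buffer_size = 50;  // unsynced: [100, 150)
    int64_t offset = 120, n = 100;                     // request:  [120, 220)
    int64_t begin = std::max(offset, pos_at_last_sync);                  // 120
    int64_t end = std::min(offset + n, pos_at_last_sync + buffer_size);  // 150
    assert(begin == 120 && end == 150);      // 30 bytes of overlap returned
    assert(begin - pos_at_last_sync == 20);  // offset_in_buffer
    assert(begin - offset == 0);             // offset_in_scratch
    return 0;
  }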
@@ -606,13 +612,137 @@ IOStatus TestFSSequentialFile::Read(size_t n, const IOOptions& options,
    return s;
  }

  // Some complex logic is needed to deal with concurrent write to the same
  // file, while keeping good performance (e.g. not holding FS mutex during
  // I/O op), especially in common cases.

  if (read_pos_ == target_read_pos_) {
    // Normal case: start by reading from underlying file
    s = target()->Read(n, options, result, scratch, dbg);
    if (!s.ok()) {
      return s;
    }
    target_read_pos_ += result->size();
  } else {
    // We must have previously read buffered data (unsynced) not written to
    // target. Deal with this case (and more) below.
    *result = {};
  }

  if (fs_->ReadUnsyncedData() && result->size() < n) {
    // We need to check if there's unsynced data to fill out the rest of the
    // read.

    // First, ensure target read data is in scratch for easy handling.
    MoveToScratchIfNeeded(result, scratch);
    assert(result->data() == scratch);

    // If we just did a target Read, we only want unsynced data after it
    // (target_read_pos_). Otherwise (e.g. if target is behind because of
    // unsynced data) we want unsynced data starting at the current read pos
    // (read_pos_, not yet updated).
    const uint64_t unsynced_read_pos = std::max(target_read_pos_, read_pos_);
    const size_t offset_from_read_pos =
        static_cast<size_t>(unsynced_read_pos - read_pos_);
    Slice unsynced_result;
    int64_t pos_at_last_sync = -1;
    fs_->ReadUnsynced(fname_, unsynced_read_pos, n - offset_from_read_pos,
                      &unsynced_result, scratch + offset_from_read_pos,
                      &pos_at_last_sync);
    assert(unsynced_result.data() >= scratch + offset_from_read_pos);
    assert(unsynced_result.data() < scratch + n);
    // Now, there are several cases to consider (some grouped together):
    if (pos_at_last_sync <= static_cast<int64_t>(unsynced_read_pos)) {
      // 1. We didn't get any unsynced data because nothing has been written
      // to the file beyond unsynced_read_pos (including untracked
      // pos_at_last_sync == -1).
      // 2. We got some unsynced data starting at unsynced_read_pos (possibly
      // on top of some synced data from target). We don't need to try
      // reading any more from target because we established a "point in
      // time" for completing this Read in which we read as much tail data
      // (unsynced) as we could.
      assert(pos_at_last_sync >= 0 || unsynced_result.size() == 0);

      // Combined data is already lined up in scratch.
      assert(result->data() + result->size() == unsynced_result.data());
      assert(result->size() + unsynced_result.size() <= n);
      // Combine results
      *result = Slice(result->data(), result->size() + unsynced_result.size());
    } else {
      // 3. Any unsynced data we got was after unsynced_read_pos because the
      // file was synced some time since our last target Read (either from
      // this Read or a prior Read). We need to read more data from target to
      // ensure this Read is filled out, even though we might have already
      // read some (but not all, due to a race). This code handles:
      //
      // * Catching up target after prior read(s) of unsynced data
      // * Racing Sync in another thread since we called target Read above
      //
      // And merging potentially three results together for this Read:
      // * The original target Read above
      // * The following (non-throw-away) target Read
      // * The ReadUnsynced above, which is always last if it returned data,
      // so that we have a "point in time" for completing this Read in which
      // we read as much tail data (unsynced) as we could.
      //
      // Deeper note about the race: we cannot just treat the original target
      // Read as a "point in time" view of available data in the file,
      // because there might have been unsynced data at that time, which
      // became synced data by the time we read unsynced data. That is the
      // race we are resolving with this "double check"-style code.
      const size_t supplemental_read_pos = unsynced_read_pos;

      // First, if there's any data from target that we know we would need to
      // throw away to catch up, try to do it.
      if (target_read_pos_ < supplemental_read_pos) {
        Slice throw_away_result;
        size_t throw_away_n = supplemental_read_pos - target_read_pos_;
        std::unique_ptr<char[]> throw_away_scratch{new char[throw_away_n]};
        s = target()->Read(throw_away_n, options, &throw_away_result,
                           throw_away_scratch.get(), dbg);
        if (!s.ok()) {
          read_pos_ += result->size();
          return s;
        }
        target_read_pos_ += throw_away_result.size();
        if (target_read_pos_ < supplemental_read_pos) {
          // Because of pos_at_last_sync > supplemental_read_pos, we should
          // have been able to catch up
          read_pos_ += result->size();
          return IOStatus::IOError(
              "Unexpected truncation or short read of file " + fname_);
        }
      }
      // Now we can do a productive supplemental Read from target
      assert(target_read_pos_ == supplemental_read_pos);
      Slice supplemental_result;
      size_t supplemental_n =
          unsynced_result.size() == 0
              ? n - offset_from_read_pos
              : unsynced_result.data() - (scratch + offset_from_read_pos);
      s = target()->Read(supplemental_n, options, &supplemental_result,
                         scratch + offset_from_read_pos, dbg);
      if (!s.ok()) {
        read_pos_ += result->size();
        return s;
      }
      target_read_pos_ += supplemental_result.size();
      MoveToScratchIfNeeded(&supplemental_result,
                            scratch + offset_from_read_pos);

      // Combined data is already lined up in scratch.
      assert(result->data() + result->size() == supplemental_result.data());
      assert(unsynced_result.size() == 0 ||
             supplemental_result.data() + supplemental_result.size() ==
                 unsynced_result.data());
      assert(result->size() + supplemental_result.size() +
                 unsynced_result.size() <=
             n);
      // Combine results
      *result =
          Slice(result->data(), result->size() + supplemental_result.size() +
                                    unsynced_result.size());
    }
  }
  read_pos_ += result->size();
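To make case 3 concrete, here is a standalone sketch (hypothetical values, not part of the patch) of the race it resolves: 100 bytes are synced and 50 unsynced when a reader requests 120 bytes; the initial target Read returns only the 100 synced bytes; another thread then Syncs, so ReadUnsynced reports pos_at_last_sync = 150 with no unsynced data overlapping [100, 120), and a supplemental target Read must fetch the now-synced bytes [100, 120).

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    // Reader state after the initial target Read returned the synced prefix.
    int64_t read_pos = 0, target_read_pos = 100, n = 120;
    // A racing Sync in another thread makes all 150 bytes synced, so
    // ReadUnsynced finds nothing unsynced in [100, 120) and reports:
    int64_t pos_at_last_sync = 150;
    int64_t unsynced_read_pos = std::max(target_read_pos, read_pos);  // 100
    // Case 3 applies: the sync point moved past where unsynced data was
    // requested, so more data is now available from the target file.
    assert(pos_at_last_sync > unsynced_read_pos);
    // Nothing to throw away (target_read_pos == unsynced_read_pos), so a
    // supplemental target Read of 20 bytes completes the request.
    int64_t offset_from_read_pos = unsynced_read_pos - read_pos;  // 100
    int64_t supplemental_n = n - offset_from_read_pos;            // 20
    assert(supplemental_n == 20);  // fills bytes [100, 120) of the request
    return 0;
  }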
@@ -41,8 +41,8 @@ enum class FaultInjectionIOType {

struct FSFileState {
  std::string filename_;
-  ssize_t pos_at_last_append_;
-  ssize_t pos_at_last_sync_;
+  int64_t pos_at_last_append_;
+  int64_t pos_at_last_sync_;
  std::string buffer_;

  explicit FSFileState(const std::string& filename)
@@ -178,7 +178,8 @@ class TestFSSequentialFile : public FSSequentialFileOwnerWrapper {
 private:
  FaultInjectionTestFS* fs_;
  std::string fname_;
-  size_t read_pos_ = 0;
+  uint64_t read_pos_ = 0;
+  uint64_t target_read_pos_ = 0;
};

class TestFSDirectory : public FSDirectory {
@@ -548,8 +549,14 @@ class FaultInjectionTestFS : public FileSystemWrapper {

  void PrintInjectedThreadLocalErrorBacktrace(FaultInjectionIOType type);

-  void AddUnsyncedToRead(const std::string& fname, size_t offset, size_t n,
-                         Slice* result, char* scratch);
+  // If there is unsynced data in the specified file within the specified
+  // range [offset, offset + n), return the unsynced data overlapping with
+  // that range, in a corresponding range of scratch. When known, also return
+  // the position of the last sync, so that the caller can determine whether
+  // more data is available from the target file when not available from
+  // unsynced.
+  void ReadUnsynced(const std::string& fname, uint64_t offset, size_t n,
+                    Slice* result, char* scratch, int64_t* pos_at_last_sync);

 private:
  port::Mutex mutex_;
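A hedged caller sketch of the contract documented above (a hypothetical helper mirroring how TestFSSequentialFile::Read uses it, not code from the patch): pos_at_last_sync stays -1 for an untracked file, is 0 when the file is tracked but nothing has been synced, and a value beyond the requested offset signals that synced data not yet read is available from the target file (the "catch up" case).

  // Hypothetical helper (not in the patch) showing the ReadUnsynced contract.
  bool MoreSyncedDataAvailable(FaultInjectionTestFS& fs, const std::string& f,
                               uint64_t pos, size_t len, char* scratch) {
    Slice unsynced;
    int64_t pos_at_last_sync = -1;  // stays -1 if the file is untracked
    fs.ReadUnsynced(f, pos, len, &unsynced, scratch, &pos_at_last_sync);
    // `unsynced` now covers the overlap of [pos, pos + len) with the unsynced
    // buffer. If the last sync ended beyond `pos`, synced data we have not
    // read yet is available from the target file.
    return pos_at_last_sync > static_cast<int64_t>(pos);
  }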