Support read & write with unsynced data in FaultInjectionTestFS (#12852)

Summary:
Follow-up to https://github.com/facebook/rocksdb/issues/12729 and others to fix FaultInjectionTestFS handling the case where a live WAL is being appended to and synced while also being copied for checkpoint or backup, up to a known flushed (but not necessarily synced) prefix of the file. It was tricky to structure the code in a way that could handle a tricky race with Sync in another thread (see code comments, thanks Changyu) while maintaining good performance and test-ability.

For more context, see the call to FlushWAL() in DBImpl::GetLiveFilesStorageInfo().

Also, the unit test for https://github.com/facebook/rocksdb/issues/12729 was neutered by https://github.com/facebook/rocksdb/issues/12797, and this re-enables the functionality it is testing.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12852

Test Plan:
unit test expanded/updated. Local runs of blackbox_crash_test.

The implementation is structured so that a multi-threaded unit test is not needed to cover at least the code lines, as the race handling is folded into "catch up after returning unsynced and then a sync."

Reviewed By: cbi42

Differential Revision: D59594045

Pulled By: pdillinger

fbshipit-source-id: 94667bb72255e2952586c53bae2c2dd384e85a50
This commit is contained in:
Peter Dillinger 2024-07-12 16:01:57 -07:00 committed by Facebook GitHub Bot
parent 3db030d7ee
commit 72438a6788
3 changed files with 228 additions and 62 deletions

View File

@ -567,49 +567,78 @@ class FaultInjectionDBTest : public DBTestBase {
TEST(FaultInjectionFSTest, ReadUnsyncedData) {
std::shared_ptr<FaultInjectionTestFS> fault_fs =
std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
fault_fs->SetInjectUnsyncedDataLoss(true);
ASSERT_TRUE(fault_fs->ReadUnsyncedData());
ASSERT_TRUE(fault_fs->InjectUnsyncedDataLoss());
// This is a randomized mini-stress test, to reduce the chances of bugs in
// FaultInjectionTestFS being caught only in db_stress, where they are
// difficult to debug.
uint32_t len = Random::GetTLSInstance()->Uniform(10000) + 1;
Random rnd(len);
// Create partially synced file
std::string f = test::PerThreadDBPath("read_unsynced.data");
std::string data = rnd.RandomString(len);
{
std::unique_ptr<FSWritableFile> w;
ASSERT_OK(fault_fs->NewWritableFile(f, {}, &w, nullptr));
uint32_t synced_len = rnd.Uniform(len + 1);
ASSERT_OK(w->Append(Slice(data.data(), synced_len), {}, nullptr));
// Create partially synced file
std::unique_ptr<FSWritableFile> w;
ASSERT_OK(fault_fs->NewWritableFile(f, {}, &w, nullptr));
uint32_t synced_len = rnd.Uniform(len + 1);
ASSERT_OK(w->Append(Slice(data.data(), synced_len), {}, nullptr));
if (synced_len > 0) {
ASSERT_OK(w->Sync({}, nullptr));
ASSERT_OK(w->Append(Slice(data.data() + synced_len, len - synced_len), {},
nullptr));
// no sync here
ASSERT_OK(w->Close({}, nullptr));
}
ASSERT_OK(w->Append(Slice(data.data() + synced_len, len - synced_len), {},
nullptr));
// Test file size includes unsynced data
{
uint64_t file_size;
ASSERT_OK(fault_fs->GetFileSize(f, {}, &file_size, nullptr));
ASSERT_EQ(len, file_size);
}
// Test read file contents, with two reads that probably don't
// align with the unsynced split.
{
std::unique_ptr<FSSequentialFile> r;
ASSERT_OK(fault_fs->NewSequentialFile(f, {}, &r, nullptr));
uint32_t first_read_len = rnd.Uniform(len + 1);
Slice sl;
std::unique_ptr<char[]> scratch(new char[first_read_len]);
ASSERT_OK(r->Read(first_read_len, {}, &sl, scratch.get(), nullptr));
ASSERT_EQ(first_read_len, sl.size());
ASSERT_EQ(0, sl.compare(Slice(data.data(), first_read_len)));
uint32_t second_read_len = len - first_read_len;
scratch.reset(new char[second_read_len]);
ASSERT_OK(r->Read(second_read_len, {}, &sl, scratch.get(), nullptr));
ASSERT_EQ(second_read_len, sl.size());
ASSERT_EQ(0,
sl.compare(Slice(data.data() + first_read_len, second_read_len)));
// align with the unsynced split. And maybe a sync or write between
// the two reads.
std::unique_ptr<FSSequentialFile> r;
ASSERT_OK(fault_fs->NewSequentialFile(f, {}, &r, nullptr));
uint32_t first_read_len = rnd.Uniform(len + 1);
Slice sl;
std::unique_ptr<char[]> scratch(new char[first_read_len]);
ASSERT_OK(r->Read(first_read_len, {}, &sl, scratch.get(), nullptr));
ASSERT_EQ(first_read_len, sl.size());
ASSERT_EQ(0, sl.compare(Slice(data.data(), first_read_len)));
// Maybe a sync and/or write and/or close between the two reads.
if (rnd.OneIn(2)) {
ASSERT_OK(w->Sync({}, nullptr));
}
if (rnd.OneIn(2)) {
uint32_t more_len = rnd.Uniform(1000) + 1;
std::string more_data = rnd.RandomString(more_len);
ASSERT_OK(w->Append(more_data, {}, nullptr));
data += more_data;
len += more_len;
}
if (rnd.OneIn(2)) {
ASSERT_OK(w->Sync({}, nullptr));
}
if (rnd.OneIn(2)) {
ASSERT_OK(w->Close({}, nullptr));
w.reset();
}
// Second read some of, all of, or more than rest of file
uint32_t second_read_len = rnd.Uniform(len + 1);
scratch.reset(new char[second_read_len]);
ASSERT_OK(r->Read(second_read_len, {}, &sl, scratch.get(), nullptr));
if (len - first_read_len < second_read_len) {
ASSERT_EQ(len - first_read_len, sl.size());
} else {
ASSERT_EQ(second_read_len, sl.size());
}
ASSERT_EQ(0, sl.compare(Slice(data.data() + first_read_len, sl.size())));
}
} // namespace ROCKSDB_NAMESPACE

View File

@ -559,38 +559,44 @@ size_t TestFSRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
}
}
void FaultInjectionTestFS::AddUnsyncedToRead(const std::string& fname,
size_t pos, size_t n,
Slice* result, char* scratch) {
// Should be checked prior
assert(result->size() < n);
size_t pos_after = pos + result->size();
namespace {
// Modifies `result` to start at the beginning of `scratch` if not already,
// copying data there if needed.
void MoveToScratchIfNeeded(Slice* result, char* scratch) {
if (result->data() != scratch) {
// NOTE: might overlap
std::copy_n(result->data(), result->size(), scratch);
}
*result = Slice(scratch, result->size());
}
} // namespace
void FaultInjectionTestFS::ReadUnsynced(const std::string& fname,
uint64_t offset, size_t n,
Slice* result, char* scratch,
int64_t* pos_at_last_sync) {
*result = Slice(scratch, 0); // default empty result
MutexLock l(&mutex_);
auto it = db_file_state_.find(fname);
if (it != db_file_state_.end()) {
auto& st = it->second;
if (st.pos_at_last_append_ > static_cast<ssize_t>(pos_after)) {
size_t remaining_requested = n - result->size();
size_t to_copy =
std::min(remaining_requested,
static_cast<size_t>(st.pos_at_last_append_) - pos_after);
size_t buffer_offset = pos_after - static_cast<size_t>(std::max(
st.pos_at_last_sync_, ssize_t{0}));
// Data might have been dropped from buffer
if (st.buffer_.size() > buffer_offset) {
to_copy = std::min(to_copy, st.buffer_.size() - buffer_offset);
if (result->data() != scratch) {
// TODO: this will be needed when supporting random reads
// but not currently used
abort();
// NOTE: might overlap
// std::copy_n(result->data(), result->size(), scratch);
}
std::copy_n(st.buffer_.data() + buffer_offset, to_copy,
scratch + result->size());
*result = Slice(scratch, result->size() + to_copy);
}
// Use 0 to mean "tracked but nothing synced"
*pos_at_last_sync = std::max(st.pos_at_last_sync_, int64_t{0});
// Find overlap between [offset, offset + n) and
// [*pos_at_last_sync, *pos_at_last_sync + st.buffer_.size())
int64_t begin = std::max(static_cast<int64_t>(offset), *pos_at_last_sync);
int64_t end =
std::min(static_cast<int64_t>(offset + n),
*pos_at_last_sync + static_cast<int64_t>(st.buffer_.size()));
// Copy and return overlap if there is any
if (begin < end) {
size_t offset_in_buffer = static_cast<size_t>(begin - *pos_at_last_sync);
size_t offset_in_scratch = static_cast<size_t>(begin - offset);
std::copy_n(st.buffer_.data() + offset_in_buffer, end - begin,
scratch + offset_in_scratch);
*result = Slice(scratch + offset_in_scratch, end - begin);
}
}
}
@ -606,13 +612,137 @@ IOStatus TestFSSequentialFile::Read(size_t n, const IOOptions& options,
return s;
}
s = target()->Read(n, options, result, scratch, dbg);
if (!s.ok()) {
return s;
// Some complex logic is needed to deal with concurrent write to the same
// file, while keeping good performance (e.g. not holding FS mutex during
// I/O op), especially in common cases.
if (read_pos_ == target_read_pos_) {
// Normal case: start by reading from underlying file
s = target()->Read(n, options, result, scratch, dbg);
if (!s.ok()) {
return s;
}
target_read_pos_ += result->size();
} else {
// We must have previously read buffered data (unsynced) not written to
// target. Deal with this case (and more) below.
*result = {};
}
if (fs_->ReadUnsyncedData() && result->size() < n) {
fs_->AddUnsyncedToRead(fname_, read_pos_, n, result, scratch);
// We need to check if there's unsynced data to fill out the rest of the
// read.
// First, ensure target read data is in scratch for easy handling.
MoveToScratchIfNeeded(result, scratch);
assert(result->data() == scratch);
// If we just did a target Read, we only want unsynced data after it
// (target_read_pos_). Otherwise (e.g. if target is behind because of
// unsynced data) we want unsynced data starting at the current read pos
// (read_pos_, not yet updated).
const uint64_t unsynced_read_pos = std::max(target_read_pos_, read_pos_);
const size_t offset_from_read_pos =
static_cast<size_t>(unsynced_read_pos - read_pos_);
Slice unsynced_result;
int64_t pos_at_last_sync = -1;
fs_->ReadUnsynced(fname_, unsynced_read_pos, n - offset_from_read_pos,
&unsynced_result, scratch + offset_from_read_pos,
&pos_at_last_sync);
assert(unsynced_result.data() >= scratch + offset_from_read_pos);
assert(unsynced_result.data() < scratch + n);
// Now, there are several cases to consider (some grouped together):
if (pos_at_last_sync <= static_cast<int64_t>(unsynced_read_pos)) {
// 1. We didn't get any unsynced data because nothing has been written
// to the file beyond unsynced_read_pos (including untracked
// pos_at_last_sync == -1)
// 2. We got some unsynced data starting at unsynced_read_pos (possibly
// on top of some synced data from target). We don't need to try reading
// any more from target because we established a "point in time" for
// completing this Read in which we read as much tail data (unsynced) as
// we could.
assert(pos_at_last_sync >= 0 || unsynced_result.size() == 0);
// Combined data is already lined up in scratch.
assert(result->data() + result->size() == unsynced_result.data());
assert(result->size() + unsynced_result.size() <= n);
// Combine results
*result = Slice(result->data(), result->size() + unsynced_result.size());
} else {
// 3. Any unsynced data we got was after unsynced_read_pos because the
// file was synced some time since our last target Read (either from this
// Read or a prior Read). We need to read more data from target to ensure
// this Read is filled out, even though we might have already read some
// (but not all due to a race). This code handles:
//
// * Catching up target after prior read(s) of unsynced data
// * Racing Sync in another thread since we called target Read above
//
// And merging potentially three results together for this Read:
// * The original target Read above
// * The following (non-throw-away) target Read
// * The ReadUnsynced above, which is always last if it returned data,
// so that we have a "point in time" for completing this Read in which we
// read as much tail data (unsynced) as we could.
//
// Deeper note about the race: we cannot just treat the original target
// Read as a "point in time" view of available data in the file, because
// there might have been unsynced data at that time, which became synced
// data by the time we read unsynced data. That is the race we are
// resolving with this "double check"-style code.
const size_t supplemental_read_pos = unsynced_read_pos;
// First, if there's any data from target that we know we would need to
// throw away to catch up, try to do it.
if (target_read_pos_ < supplemental_read_pos) {
Slice throw_away_result;
size_t throw_away_n = supplemental_read_pos - target_read_pos_;
std::unique_ptr<char[]> throw_away_scratch{new char[throw_away_n]};
s = target()->Read(throw_away_n, options, &throw_away_result,
throw_away_scratch.get(), dbg);
if (!s.ok()) {
read_pos_ += result->size();
return s;
}
target_read_pos_ += throw_away_result.size();
if (target_read_pos_ < supplemental_read_pos) {
// Because of pos_at_last_sync > supplemental_read_pos, we should
// have been able to catch up
read_pos_ += result->size();
return IOStatus::IOError(
"Unexpected truncation or short read of file " + fname_);
}
}
// Now we can do a productive supplemental Read from target
assert(target_read_pos_ == supplemental_read_pos);
Slice supplemental_result;
size_t supplemental_n =
unsynced_result.size() == 0
? n - offset_from_read_pos
: unsynced_result.data() - (scratch + offset_from_read_pos);
s = target()->Read(supplemental_n, options, &supplemental_result,
scratch + offset_from_read_pos, dbg);
if (!s.ok()) {
read_pos_ += result->size();
return s;
}
target_read_pos_ += supplemental_result.size();
MoveToScratchIfNeeded(&supplemental_result,
scratch + offset_from_read_pos);
// Combined data is already lined up in scratch.
assert(result->data() + result->size() == supplemental_result.data());
assert(unsynced_result.size() == 0 ||
supplemental_result.data() + supplemental_result.size() ==
unsynced_result.data());
assert(result->size() + supplemental_result.size() +
unsynced_result.size() <=
n);
// Combine results
*result =
Slice(result->data(), result->size() + supplemental_result.size() +
unsynced_result.size());
}
}
read_pos_ += result->size();

View File

@ -41,8 +41,8 @@ enum class FaultInjectionIOType {
struct FSFileState {
std::string filename_;
ssize_t pos_at_last_append_;
ssize_t pos_at_last_sync_;
int64_t pos_at_last_append_;
int64_t pos_at_last_sync_;
std::string buffer_;
explicit FSFileState(const std::string& filename)
@ -178,7 +178,8 @@ class TestFSSequentialFile : public FSSequentialFileOwnerWrapper {
private:
FaultInjectionTestFS* fs_;
std::string fname_;
size_t read_pos_ = 0;
uint64_t read_pos_ = 0;
uint64_t target_read_pos_ = 0;
};
class TestFSDirectory : public FSDirectory {
@ -548,8 +549,14 @@ class FaultInjectionTestFS : public FileSystemWrapper {
void PrintInjectedThreadLocalErrorBacktrace(FaultInjectionIOType type);
void AddUnsyncedToRead(const std::string& fname, size_t offset, size_t n,
Slice* result, char* scratch);
// If there is unsynced data in the specified file within the specified
// range [offset, offset + n), return the unsynced data overlapping with
// that range, in a corresponding range of scratch. When known, also return
// the position of the last sync, so that the caller can determine whether
// more data is available from the target file when not available from
// unsynced.
void ReadUnsynced(const std::string& fname, uint64_t offset, size_t n,
Slice* result, char* scratch, int64_t* pos_at_last_sync);
private:
port::Mutex mutex_;