mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-29 00:34:03 +00:00
Fix and defend against FilePrefetchBuffer combined with mmap reads (#12206)
Summary: FilePrefetchBuffer makes an unchecked assumption about the behavior of RandomAccessFileReader::Read: that it will write to the provided buffer rather than returning the data in an alternate buffer. FilePrefetchBuffer has been quietly incompatible with mmap reads (e.g. allow_mmap_reads / use_mmap_reads) because in that case an alternate buffer is returned (mmapped memory). This incompatibility currently leads to quiet data corruption, as seen in amplified crash test failure in https://github.com/facebook/rocksdb/issues/12200. In this change, * Check whether RandomAccessFileReader::Read has the expected behavior, and fail if not. (Assertion failure in debug build, return Corruption in release build.) This will detect future regressions synchronously and precisely, rather than relying on debugging downstream data corruption. * Why not recover? My understanding is that FilePrefetchBuffer is not intended for use when RandomAccessFileReader::Read uses an alternate buffer, so quietly recovering could lead to undesirable (inefficient) behavior. * Mention incompatibility with mmap-based readers in the internal API comments for FilePrefetchBuffer * Fix two cases where FilePrefetchBuffer could be used with mmap, both stemming from SstFileDumper, though one fix is in BlockBasedTableReader. There is currently no way to ask a RandomAccessFileReader whether it's using mmap, so we currently have to rely on other options as clues. Keeping separate from https://github.com/facebook/rocksdb/issues/12200 in part because this change is more appropriate for backport than that one. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12206 Test Plan: * Manually verified that the new check aids in debugging. * Unit test added, that fails if either fix is missed. * Ran blackbox_crash_test for hours, with and without https://github.com/facebook/rocksdb/issues/12200 Reviewed By: akankshamahajan15 Differential Revision: D52551701 Pulled By: pdillinger fbshipit-source-id: dea87c5782b7c484a6c6e424585c8832dfc580dc
This commit is contained in:
parent
f11a0237b6
commit
ed46981bea
|
@ -84,9 +84,9 @@ Status FilePrefetchBuffer::Read(const IOOptions& opts,
|
||||||
uint64_t read_len, uint64_t chunk_len,
|
uint64_t read_len, uint64_t chunk_len,
|
||||||
uint64_t start_offset, uint32_t index) {
|
uint64_t start_offset, uint32_t index) {
|
||||||
Slice result;
|
Slice result;
|
||||||
|
char* to_buf = bufs_[index].buffer_.BufferStart() + chunk_len;
|
||||||
Status s = reader->Read(opts, start_offset + chunk_len, read_len, &result,
|
Status s = reader->Read(opts, start_offset + chunk_len, read_len, &result,
|
||||||
bufs_[index].buffer_.BufferStart() + chunk_len,
|
to_buf, /*aligned_buf=*/nullptr);
|
||||||
/*aligned_buf=*/nullptr);
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
if (result.size() < read_len) {
|
if (result.size() < read_len) {
|
||||||
// Fake an IO error to force db_stress fault injection to ignore
|
// Fake an IO error to force db_stress fault injection to ignore
|
||||||
|
@ -97,6 +97,13 @@ Status FilePrefetchBuffer::Read(const IOOptions& opts,
|
||||||
if (!s.ok()) {
|
if (!s.ok()) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
if (result.data() != to_buf) {
|
||||||
|
// If the read is coming from some other buffer already in memory (such as
|
||||||
|
// mmap) then it would be inefficient to create another copy in this
|
||||||
|
// FilePrefetchBuffer. The caller is expected to exclude this case.
|
||||||
|
assert(false);
|
||||||
|
return Status::Corruption("File read didn't populate our buffer");
|
||||||
|
}
|
||||||
|
|
||||||
if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
|
if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
|
||||||
RecordTick(stats_, PREFETCH_BYTES, read_len);
|
RecordTick(stats_, PREFETCH_BYTES, read_len);
|
||||||
|
|
|
@ -91,7 +91,8 @@ class FilePrefetchBuffer {
|
||||||
// max_readahead_size should be greater than equal to readahead_size.
|
// max_readahead_size should be greater than equal to readahead_size.
|
||||||
// enable : controls whether reading from the buffer is enabled.
|
// enable : controls whether reading from the buffer is enabled.
|
||||||
// If false, TryReadFromCache() always return false, and we only take stats
|
// If false, TryReadFromCache() always return false, and we only take stats
|
||||||
// for the minimum offset if track_min_offset = true.
|
// for the minimum offset if track_min_offset = true. See below NOTE about
|
||||||
|
// mmap reads.
|
||||||
// track_min_offset : Track the minimum offset ever read and collect stats on
|
// track_min_offset : Track the minimum offset ever read and collect stats on
|
||||||
// it. Used for adaptable readahead of the file footer/metadata.
|
// it. Used for adaptable readahead of the file footer/metadata.
|
||||||
// implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after
|
// implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after
|
||||||
|
@ -101,6 +102,9 @@ class FilePrefetchBuffer {
|
||||||
// and max_readahead_size are passed in.
|
// and max_readahead_size are passed in.
|
||||||
// A user can construct a FilePrefetchBuffer without any arguments, but use
|
// A user can construct a FilePrefetchBuffer without any arguments, but use
|
||||||
// `Prefetch` to load data into the buffer.
|
// `Prefetch` to load data into the buffer.
|
||||||
|
// NOTE: FilePrefetchBuffer is incompatible with prefetching from
|
||||||
|
// RandomAccessFileReaders using mmap reads, so it is common to use
|
||||||
|
// `!use_mmap_reads` for the `enable` parameter.
|
||||||
FilePrefetchBuffer(
|
FilePrefetchBuffer(
|
||||||
size_t readahead_size = 0, size_t max_readahead_size = 0,
|
size_t readahead_size = 0, size_t max_readahead_size = 0,
|
||||||
bool enable = true, bool track_min_offset = false,
|
bool enable = true, bool track_min_offset = false,
|
||||||
|
|
|
@ -601,7 +601,7 @@ Status BlockBasedTable::Open(
|
||||||
const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
|
const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0;
|
||||||
const bool preload_all = !table_options.cache_index_and_filter_blocks;
|
const bool preload_all = !table_options.cache_index_and_filter_blocks;
|
||||||
|
|
||||||
if (!ioptions.allow_mmap_reads) {
|
if (!ioptions.allow_mmap_reads && !env_options.use_mmap_reads) {
|
||||||
s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch,
|
s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch,
|
||||||
tail_prefetch_stats, prefetch_all, preload_all,
|
tail_prefetch_stats, prefetch_all, preload_all,
|
||||||
&prefetch_buffer, ioptions.stats, tail_size,
|
&prefetch_buffer, ioptions.stats, tail_size,
|
||||||
|
|
|
@ -162,7 +162,7 @@ class BlockBasedTableReaderBaseTest : public testing::Test {
|
||||||
bool user_defined_timestamps_persisted = true) {
|
bool user_defined_timestamps_persisted = true) {
|
||||||
const MutableCFOptions moptions(options_);
|
const MutableCFOptions moptions(options_);
|
||||||
TableReaderOptions table_reader_options = TableReaderOptions(
|
TableReaderOptions table_reader_options = TableReaderOptions(
|
||||||
ioptions, moptions.prefix_extractor, EnvOptions(), comparator,
|
ioptions, moptions.prefix_extractor, foptions, comparator,
|
||||||
0 /* block_protection_bytes_per_key */, false /* _skip_filters */,
|
0 /* block_protection_bytes_per_key */, false /* _skip_filters */,
|
||||||
false /* _immortal */, false /* _force_direct_prefetch */,
|
false /* _immortal */, false /* _force_direct_prefetch */,
|
||||||
-1 /* _level */, nullptr /* _block_cache_tracer */,
|
-1 /* _level */, nullptr /* _block_cache_tracer */,
|
||||||
|
|
|
@ -104,8 +104,8 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) {
|
||||||
file_.reset(new RandomAccessFileReader(std::move(file), file_path));
|
file_.reset(new RandomAccessFileReader(std::move(file), file_path));
|
||||||
|
|
||||||
FilePrefetchBuffer prefetch_buffer(
|
FilePrefetchBuffer prefetch_buffer(
|
||||||
0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */,
|
0 /* readahead_size */, 0 /* max_readahead_size */,
|
||||||
false /* track_min_offset */);
|
!fopts.use_mmap_reads /* enable */, false /* track_min_offset */);
|
||||||
if (s.ok()) {
|
if (s.ok()) {
|
||||||
const uint64_t kSstDumpTailPrefetchSize = 512 * 1024;
|
const uint64_t kSstDumpTailPrefetchSize = 512 * 1024;
|
||||||
uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize)
|
uint64_t prefetch_size = (file_size > kSstDumpTailPrefetchSize)
|
||||||
|
|
|
@ -16,9 +16,11 @@
|
||||||
#include "rocksdb/filter_policy.h"
|
#include "rocksdb/filter_policy.h"
|
||||||
#include "rocksdb/sst_dump_tool.h"
|
#include "rocksdb/sst_dump_tool.h"
|
||||||
#include "table/block_based/block_based_table_factory.h"
|
#include "table/block_based/block_based_table_factory.h"
|
||||||
|
#include "table/sst_file_dumper.h"
|
||||||
#include "table/table_builder.h"
|
#include "table/table_builder.h"
|
||||||
#include "test_util/testharness.h"
|
#include "test_util/testharness.h"
|
||||||
#include "test_util/testutil.h"
|
#include "test_util/testutil.h"
|
||||||
|
#include "util/defer.h"
|
||||||
|
|
||||||
namespace ROCKSDB_NAMESPACE {
|
namespace ROCKSDB_NAMESPACE {
|
||||||
|
|
||||||
|
@ -485,6 +487,39 @@ TEST_F(SSTDumpToolTest, RawOutput) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(SSTDumpToolTest, SstFileDumperMmapReads) {
|
||||||
|
Options opts;
|
||||||
|
opts.env = env();
|
||||||
|
std::string file_path = MakeFilePath("rocksdb_sst_test.sst");
|
||||||
|
createSST(opts, file_path, 10);
|
||||||
|
|
||||||
|
EnvOptions env_opts;
|
||||||
|
uint64_t data_size = 0;
|
||||||
|
|
||||||
|
// Test all combinations of mmap read options
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
SaveAndRestore<bool> sar_opts(&opts.allow_mmap_reads, (i & 1) != 0);
|
||||||
|
SaveAndRestore<bool> sar_env_opts(&env_opts.use_mmap_reads, (i & 2) != 0);
|
||||||
|
|
||||||
|
SstFileDumper dumper(opts, file_path, Temperature::kUnknown,
|
||||||
|
1024 /*readahead_size*/, true /*verify_checksum*/,
|
||||||
|
false /*output_hex*/, false /*decode_blob_index*/,
|
||||||
|
env_opts);
|
||||||
|
ASSERT_OK(dumper.getStatus());
|
||||||
|
std::shared_ptr<const TableProperties> tp;
|
||||||
|
ASSERT_OK(dumper.ReadTableProperties(&tp));
|
||||||
|
ASSERT_NE(tp.get(), nullptr);
|
||||||
|
if (i == 0) {
|
||||||
|
// Verify consistency of a populated field with some entropy
|
||||||
|
data_size = tp->data_size;
|
||||||
|
ASSERT_GT(data_size, 0);
|
||||||
|
} else {
|
||||||
|
ASSERT_EQ(data_size, tp->data_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup(opts, file_path);
|
||||||
|
}
|
||||||
} // namespace ROCKSDB_NAMESPACE
|
} // namespace ROCKSDB_NAMESPACE
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Fixed some cases of in-memory data corruption using mmap reads with `BackupEngine`, `sst_dump`, or `ldb`.
|
Loading…
Reference in a new issue