mirror of https://github.com/facebook/rocksdb.git
Do readahead in VerifyChecksum() (#5713)
Summary: Right now VerifyChecksum() doesn't do read-ahead. In some use cases, users won't be able to achieve good performance. With this change, by default, RocksDB will do a default readahead, and users will be able to overwrite the readahead size by passing in a ReadOptions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5713 Test Plan: Add a new unit test. Differential Revision: D16860874 fbshipit-source-id: 0cff0fe79ac855d3d068e6ccd770770854a68413
This commit is contained in:
parent
e89b1c9c6e
commit
e1c468d16f
|
@ -2,6 +2,8 @@
|
|||
## Unreleased
|
||||
### Bug Fixes
|
||||
* Fixed a number of data races in BlobDB.
|
||||
### New Features
|
||||
* VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readhead size.
|
||||
|
||||
## 6.4.0 (7/30/2019)
|
||||
### Default Option Change
|
||||
|
|
2
Makefile
2
Makefile
|
@ -1226,7 +1226,7 @@ histogram_test: monitoring/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
|||
thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(AM_LINK)
|
||||
|
||||
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
corruption_test: db/corruption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
$(AM_LINK)
|
||||
|
||||
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
|
||||
|
|
|
@ -35,6 +35,12 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
|
|||
Status VerifySstFileChecksum(const Options& options,
|
||||
const EnvOptions& env_options,
|
||||
const std::string& file_path) {
|
||||
return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path);
|
||||
}
|
||||
Status VerifySstFileChecksum(const Options& options,
|
||||
const EnvOptions& env_options,
|
||||
const ReadOptions& read_options,
|
||||
const std::string& file_path) {
|
||||
std::unique_ptr<RandomAccessFile> file;
|
||||
uint64_t file_size;
|
||||
InternalKeyComparator internal_comparator(options.comparator);
|
||||
|
@ -59,7 +65,8 @@ Status VerifySstFileChecksum(const Options& options,
|
|||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
s = table_reader->VerifyChecksum(TableReaderCaller::kUserVerifyChecksum);
|
||||
s = table_reader->VerifyChecksum(read_options,
|
||||
TableReaderCaller::kUserVerifyChecksum);
|
||||
return s;
|
||||
}
|
||||
|
||||
|
|
|
@ -13,10 +13,11 @@
|
|||
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <cinttypes>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <cinttypes>
|
||||
#include "db/db_impl/db_impl.h"
|
||||
#include "db/db_test_util.h"
|
||||
#include "db/log_format.h"
|
||||
#include "db/version_set.h"
|
||||
#include "file/filename.h"
|
||||
|
@ -76,7 +77,11 @@ class CorruptionTest : public testing::Test {
|
|||
delete db_;
|
||||
db_ = nullptr;
|
||||
Options opt = (options ? *options : options_);
|
||||
opt.env = &env_;
|
||||
if (opt.env == Options().env) {
|
||||
// If env is not overridden, replace it with ErrorEnv.
|
||||
// Otherwise, the test already uses a non-default Env.
|
||||
opt.env = &env_;
|
||||
}
|
||||
opt.arena_block_size = 4096;
|
||||
BlockBasedTableOptions table_options;
|
||||
table_options.block_cache = tiny_cache_;
|
||||
|
@ -321,6 +326,52 @@ TEST_F(CorruptionTest, TableFile) {
|
|||
ASSERT_NOK(dbi->VerifyChecksum());
|
||||
}
|
||||
|
||||
TEST_F(CorruptionTest, VerifyChecksumReadahead) {
|
||||
Options options;
|
||||
SpecialEnv senv(Env::Default());
|
||||
options.env = &senv;
|
||||
// Disable block cache as we are going to check checksum for
|
||||
// the same file twice and measure number of reads.
|
||||
BlockBasedTableOptions table_options_no_bc;
|
||||
table_options_no_bc.no_block_cache = true;
|
||||
options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc));
|
||||
|
||||
Reopen(&options);
|
||||
|
||||
Build(10000);
|
||||
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
|
||||
dbi->TEST_FlushMemTable();
|
||||
dbi->TEST_CompactRange(0, nullptr, nullptr);
|
||||
dbi->TEST_CompactRange(1, nullptr, nullptr);
|
||||
|
||||
senv.count_random_reads_ = true;
|
||||
senv.random_read_counter_.Reset();
|
||||
ASSERT_OK(dbi->VerifyChecksum());
|
||||
|
||||
// Make sure the counter is enabled.
|
||||
ASSERT_GT(senv.random_read_counter_.Read(), 0);
|
||||
|
||||
// The SST file is about 10MB. Default readahead size is 256KB.
|
||||
// Give a conservative 20 reads for metadata blocks, The number
|
||||
// of random reads should be within 10 MB / 256KB + 20 = 60.
|
||||
ASSERT_LT(senv.random_read_counter_.Read(), 60);
|
||||
|
||||
senv.random_read_bytes_counter_ = 0;
|
||||
ReadOptions ro;
|
||||
ro.readahead_size = size_t{32 * 1024};
|
||||
ASSERT_OK(dbi->VerifyChecksum(ro));
|
||||
// The SST file is about 10MB. We set readahead size to 32KB.
|
||||
// Give 0 to 20 reads for metadata blocks, and allow real read
|
||||
// to range from 24KB to 48KB. The lower bound would be:
|
||||
// 10MB / 48KB + 0 = 213
|
||||
// The higher bound is
|
||||
// 10MB / 24KB + 20 = 447.
|
||||
ASSERT_GE(senv.random_read_counter_.Read(), 213);
|
||||
ASSERT_LE(senv.random_read_counter_.Read(), 447);
|
||||
|
||||
CloseDb();
|
||||
}
|
||||
|
||||
TEST_F(CorruptionTest, TableFileIndexData) {
|
||||
Options options;
|
||||
// very big, we'll trigger flushes manually
|
||||
|
|
|
@ -4112,7 +4112,7 @@ Status DBImpl::CreateColumnFamilyWithImport(
|
|||
return status;
|
||||
}
|
||||
|
||||
Status DBImpl::VerifyChecksum() {
|
||||
Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
|
||||
Status s;
|
||||
std::vector<ColumnFamilyData*> cfd_list;
|
||||
{
|
||||
|
@ -4143,7 +4143,8 @@ Status DBImpl::VerifyChecksum() {
|
|||
const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd;
|
||||
std::string fname = TableFileName(cfd->ioptions()->cf_paths,
|
||||
fd.GetNumber(), fd.GetPathId());
|
||||
s = rocksdb::VerifySstFileChecksum(opts, env_options_, fname);
|
||||
s = rocksdb::VerifySstFileChecksum(opts, env_options_, read_options,
|
||||
fname);
|
||||
}
|
||||
}
|
||||
if (!s.ok()) {
|
||||
|
|
|
@ -380,7 +380,8 @@ class DBImpl : public DB {
|
|||
const ExportImportFilesMetaData& metadata,
|
||||
ColumnFamilyHandle** handle) override;
|
||||
|
||||
virtual Status VerifyChecksum() override;
|
||||
using DB::VerifyChecksum;
|
||||
virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
|
||||
|
||||
using DB::StartTrace;
|
||||
virtual Status StartTrace(
|
||||
|
|
|
@ -2585,7 +2585,8 @@ class ModelDB : public DB {
|
|||
return Status::NotSupported("Not implemented.");
|
||||
}
|
||||
|
||||
Status VerifyChecksum() override {
|
||||
using DB::VerifyChecksum;
|
||||
Status VerifyChecksum(const ReadOptions&) override {
|
||||
return Status::NotSupported("Not implemented.");
|
||||
}
|
||||
|
||||
|
|
|
@ -347,8 +347,11 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
|
|||
}
|
||||
|
||||
if (ingestion_options_.verify_checksums_before_ingest) {
|
||||
status =
|
||||
table_reader->VerifyChecksum(TableReaderCaller::kExternalSSTIngestion);
|
||||
// If customized readahead size is needed, we can pass a user option
|
||||
// all the way to here. Right now we just rely on the default readahead
|
||||
// to keep things simple.
|
||||
status = table_reader->VerifyChecksum(
|
||||
ReadOptions(), TableReaderCaller::kExternalSSTIngestion);
|
||||
}
|
||||
if (!status.ok()) {
|
||||
return status;
|
||||
|
|
|
@ -339,6 +339,13 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
|
|||
Status VerifySstFileChecksum(const Options& options,
|
||||
const EnvOptions& env_options,
|
||||
const std::string& file_path);
|
||||
|
||||
// Verify the checksum of file
|
||||
Status VerifySstFileChecksum(const Options& options,
|
||||
const EnvOptions& env_options,
|
||||
const ReadOptions& read_options,
|
||||
const std::string& file_path);
|
||||
|
||||
#endif // ROCKSDB_LITE
|
||||
|
||||
} // namespace rocksdb
|
||||
|
|
|
@ -1225,7 +1225,9 @@ class DB {
|
|||
const ExportImportFilesMetaData& metadata,
|
||||
ColumnFamilyHandle** handle) = 0;
|
||||
|
||||
virtual Status VerifyChecksum() = 0;
|
||||
virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
|
||||
|
||||
virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
|
||||
|
||||
// AddFile() is deprecated, please use IngestExternalFile()
|
||||
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(
|
||||
|
|
|
@ -33,7 +33,9 @@ class SstFileReader {
|
|||
std::shared_ptr<const TableProperties> GetTableProperties() const;
|
||||
|
||||
// Verifies whether there is corruption in this table.
|
||||
Status VerifyChecksum();
|
||||
Status VerifyChecksum(const ReadOptions& /*read_options*/);
|
||||
|
||||
Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
|
||||
|
||||
private:
|
||||
struct Rep;
|
||||
|
|
|
@ -143,6 +143,10 @@ class StackableDB : public DB {
|
|||
|
||||
virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); }
|
||||
|
||||
virtual Status VerifyChecksum(const ReadOptions& options) override {
|
||||
return db_->VerifyChecksum(options);
|
||||
}
|
||||
|
||||
using DB::KeyMayExist;
|
||||
virtual bool KeyMayExist(const ReadOptions& options,
|
||||
ColumnFamilyHandle* column_family, const Slice& key,
|
||||
|
|
|
@ -62,6 +62,10 @@ extern const std::string kHashIndexPrefixesMetadataBlock;
|
|||
|
||||
typedef BlockBasedTable::IndexReader IndexReader;
|
||||
|
||||
// Found that 256 KB readahead size provides the best performance, based on
|
||||
// experiments, for auto readahead. Experiment data is in PR #3282.
|
||||
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;
|
||||
|
||||
BlockBasedTable::~BlockBasedTable() {
|
||||
delete rep_;
|
||||
}
|
||||
|
@ -2854,13 +2858,6 @@ void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
|
|||
FindKeyBackward();
|
||||
}
|
||||
|
||||
// Found that 256 KB readahead size provides the best performance, based on
|
||||
// experiments, for auto readahead. Experiment data is in PR #3282.
|
||||
template <class TBlockIter, typename TValue>
|
||||
const size_t
|
||||
BlockBasedTableIterator<TBlockIter, TValue>::kMaxAutoReadaheadSize =
|
||||
256 * 1024;
|
||||
|
||||
template <class TBlockIter, typename TValue>
|
||||
void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
|
||||
BlockHandle data_block_handle = index_iter_->value().handle;
|
||||
|
@ -2883,7 +2880,8 @@ void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
|
|||
if (read_options_.readahead_size == 0) {
|
||||
// Implicit auto readahead
|
||||
num_file_reads_++;
|
||||
if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) {
|
||||
if (num_file_reads_ >
|
||||
BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
|
||||
if (!rep->file->use_direct_io() &&
|
||||
(data_block_handle.offset() +
|
||||
static_cast<size_t>(data_block_handle.size()) +
|
||||
|
@ -2897,14 +2895,14 @@ void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
|
|||
readahead_size_);
|
||||
// Keep exponentially increasing readahead size until
|
||||
// kMaxAutoReadaheadSize.
|
||||
readahead_size_ =
|
||||
std::min(kMaxAutoReadaheadSize, readahead_size_ * 2);
|
||||
readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
|
||||
readahead_size_ * 2);
|
||||
} else if (rep->file->use_direct_io() && !prefetch_buffer_) {
|
||||
// Direct I/O
|
||||
// Let FilePrefetchBuffer take care of the readahead.
|
||||
prefetch_buffer_.reset(
|
||||
new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize,
|
||||
kMaxAutoReadaheadSize));
|
||||
prefetch_buffer_.reset(new FilePrefetchBuffer(
|
||||
rep->file.get(), BlockBasedTable::kInitAutoReadaheadSize,
|
||||
BlockBasedTable::kMaxAutoReadaheadSize));
|
||||
}
|
||||
}
|
||||
} else if (!prefetch_buffer_) {
|
||||
|
@ -3707,7 +3705,8 @@ Status BlockBasedTable::Prefetch(const Slice* const begin,
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) {
|
||||
Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
|
||||
TableReaderCaller caller) {
|
||||
Status s;
|
||||
// Check Meta blocks
|
||||
std::unique_ptr<Block> meta;
|
||||
|
@ -3725,7 +3724,7 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) {
|
|||
IndexBlockIter iiter_on_stack;
|
||||
BlockCacheLookupContext context{caller};
|
||||
InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
|
||||
ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack,
|
||||
read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
|
||||
/*get_context=*/nullptr, &context);
|
||||
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
|
||||
if (iiter != &iiter_on_stack) {
|
||||
|
@ -3735,13 +3734,22 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) {
|
|||
// error opening index iterator
|
||||
return iiter->status();
|
||||
}
|
||||
s = VerifyChecksumInBlocks(iiter);
|
||||
s = VerifyChecksumInBlocks(read_options, iiter);
|
||||
return s;
|
||||
}
|
||||
|
||||
Status BlockBasedTable::VerifyChecksumInBlocks(
|
||||
const ReadOptions& read_options,
|
||||
InternalIteratorBase<IndexValue>* index_iter) {
|
||||
Status s;
|
||||
// We are scanning the whole file, so no need to do exponential
|
||||
// increasing of the buffer size.
|
||||
size_t readahead_size = (read_options.readahead_size != 0)
|
||||
? read_options.readahead_size
|
||||
: kMaxAutoReadaheadSize;
|
||||
FilePrefetchBuffer prefetch_buffer(rep_->file.get(), readahead_size,
|
||||
readahead_size);
|
||||
|
||||
for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
|
||||
s = index_iter->status();
|
||||
if (!s.ok()) {
|
||||
|
@ -3750,9 +3758,9 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
|
|||
BlockHandle handle = index_iter->value().handle;
|
||||
BlockContents contents;
|
||||
BlockFetcher block_fetcher(
|
||||
rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
|
||||
ReadOptions(), handle, &contents, rep_->ioptions,
|
||||
false /* decompress */, false /*maybe_compressed*/, BlockType::kData,
|
||||
rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
|
||||
&contents, rep_->ioptions, false /* decompress */,
|
||||
false /*maybe_compressed*/, BlockType::kData,
|
||||
UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
|
||||
s = block_fetcher.ReadBlockContents();
|
||||
if (!s.ok()) {
|
||||
|
|
|
@ -82,6 +82,13 @@ class BlockBasedTable : public TableReader {
|
|||
// For Posix files the unique ID is three varints.
|
||||
static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
|
||||
|
||||
// All the below fields control iterator readahead
|
||||
static const size_t kInitAutoReadaheadSize = 8 * 1024;
|
||||
// Found that 256 KB readahead size provides the best performance, based on
|
||||
// experiments, for auto readahead. Experiment data is in PR #3282.
|
||||
static const size_t kMaxAutoReadaheadSize;
|
||||
static const int kMinNumFileReadsToStartAutoReadahead = 2;
|
||||
|
||||
// Attempt to open the table that is stored in bytes [0..file_size)
|
||||
// of "file", and read the metadata entries necessary to allow
|
||||
// retrieving data from the table.
|
||||
|
@ -182,7 +189,8 @@ class BlockBasedTable : public TableReader {
|
|||
// convert SST file to a human readable form
|
||||
Status DumpTable(WritableFile* out_file) override;
|
||||
|
||||
Status VerifyChecksum(TableReaderCaller caller) override;
|
||||
Status VerifyChecksum(const ReadOptions& readOptions,
|
||||
TableReaderCaller caller) override;
|
||||
|
||||
~BlockBasedTable();
|
||||
|
||||
|
@ -430,7 +438,8 @@ class BlockBasedTable : public TableReader {
|
|||
static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);
|
||||
|
||||
Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
|
||||
Status VerifyChecksumInBlocks(InternalIteratorBase<IndexValue>* index_iter);
|
||||
Status VerifyChecksumInBlocks(const ReadOptions& read_options,
|
||||
InternalIteratorBase<IndexValue>* index_iter);
|
||||
|
||||
// Create the filter from the filter block.
|
||||
std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
|
||||
|
@ -772,13 +781,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
|
|||
// lookup_context_.caller = kCompaction.
|
||||
size_t compaction_readahead_size_;
|
||||
|
||||
// All the below fields control iterator readahead
|
||||
static const size_t kInitAutoReadaheadSize = 8 * 1024;
|
||||
// Found that 256 KB readahead size provides the best performance, based on
|
||||
// experiments, for auto readahead. Experiment data is in PR #3282.
|
||||
static const size_t kMaxAutoReadaheadSize;
|
||||
static const int kMinNumFileReadsToStartAutoReadahead = 2;
|
||||
size_t readahead_size_ = kInitAutoReadaheadSize;
|
||||
size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
|
||||
size_t readahead_limit_ = 0;
|
||||
int64_t num_file_reads_ = 0;
|
||||
std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
|
||||
|
|
|
@ -79,8 +79,9 @@ std::shared_ptr<const TableProperties> SstFileReader::GetTableProperties()
|
|||
return rep_->table_reader->GetTableProperties();
|
||||
}
|
||||
|
||||
Status SstFileReader::VerifyChecksum() {
|
||||
return rep_->table_reader->VerifyChecksum(TableReaderCaller::kSSTFileReader);
|
||||
Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) {
|
||||
return rep_->table_reader->VerifyChecksum(read_options,
|
||||
TableReaderCaller::kSSTFileReader);
|
||||
}
|
||||
|
||||
} // namespace rocksdb
|
||||
|
|
|
@ -127,7 +127,8 @@ class TableReader {
|
|||
}
|
||||
|
||||
// check whether there is corruption in this db file
|
||||
virtual Status VerifyChecksum(TableReaderCaller /*caller*/) {
|
||||
virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
|
||||
TableReaderCaller /*caller*/) {
|
||||
return Status::NotSupported("VerifyChecksum() not supported");
|
||||
}
|
||||
};
|
||||
|
|
|
@ -143,7 +143,9 @@ Status SstFileDumper::NewTableReader(
|
|||
}
|
||||
|
||||
Status SstFileDumper::VerifyChecksum() {
|
||||
return table_reader_->VerifyChecksum(TableReaderCaller::kSSTDumpTool);
|
||||
// We could pass specific readahead setting into read options if needed.
|
||||
return table_reader_->VerifyChecksum(ReadOptions(),
|
||||
TableReaderCaller::kSSTDumpTool);
|
||||
}
|
||||
|
||||
Status SstFileDumper::DumpTable(const std::string& out_filename) {
|
||||
|
|
Loading…
Reference in New Issue