mirror of https://github.com/facebook/rocksdb.git
Add ticker stats for read corruption retries (#12923)
Summary: Add a couple of ticker stats for corruption retry count and successful retries. This PR also eliminates an extra read attempt when there's a checksum mismatch in a block read from the prefetch buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12923 Test Plan: Update existing tests Reviewed By: jowlyzhang Differential Revision: D61024687 Pulled By: anand1976 fbshipit-source-id: 3a08403580ab244000e0d480b7ee0f5a03d76b06
This commit is contained in:
parent
b65e29a4a9
commit
c21fe1a47f
|
@ -530,6 +530,12 @@ Status DBImpl::Recover(
|
|||
/*no_error_if_files_missing=*/false, is_retry,
|
||||
&desc_status);
|
||||
desc_status.PermitUncheckedError();
|
||||
if (is_retry) {
|
||||
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_COUNT);
|
||||
if (desc_status.ok()) {
|
||||
RecordTick(stats_, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
||||
}
|
||||
}
|
||||
if (can_retry) {
|
||||
// If we're opening for the first time and the failure is likely due to
|
||||
// a corrupt MANIFEST file (could result in either the log::Reader
|
||||
|
|
|
@ -705,6 +705,7 @@ class DBIOCorruptionTest
|
|||
DBIOCorruptionTest() : DBIOFailureTest() {
|
||||
BlockBasedTableOptions bbto;
|
||||
options_ = CurrentOptions();
|
||||
options_.statistics = CreateDBStatistics();
|
||||
|
||||
base_env_ = env_;
|
||||
EXPECT_NE(base_env_, nullptr);
|
||||
|
@ -727,6 +728,8 @@ class DBIOCorruptionTest
|
|||
|
||||
Status ReopenDB() { return TryReopen(options_); }
|
||||
|
||||
Statistics* stats() { return options_.statistics.get(); }
|
||||
|
||||
protected:
|
||||
std::unique_ptr<Env> env_guard_;
|
||||
std::shared_ptr<CorruptionFS> fs_;
|
||||
|
@ -749,8 +752,12 @@ TEST_P(DBIOCorruptionTest, GetReadCorruptionRetry) {
|
|||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(s);
|
||||
ASSERT_EQ(val, "val1");
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
} else {
|
||||
ASSERT_TRUE(s.IsCorruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -773,8 +780,12 @@ TEST_P(DBIOCorruptionTest, IterReadCorruptionRetry) {
|
|||
}
|
||||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(iter->status());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
} else {
|
||||
ASSERT_TRUE(iter->status().IsCorruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
delete iter;
|
||||
}
|
||||
|
@ -799,9 +810,13 @@ TEST_P(DBIOCorruptionTest, MultiGetReadCorruptionRetry) {
|
|||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_EQ(values[0].ToString(), "val1");
|
||||
ASSERT_EQ(values[1].ToString(), "val2");
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
} else {
|
||||
ASSERT_TRUE(statuses[0].IsCorruption());
|
||||
ASSERT_TRUE(statuses[1].IsCorruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -818,6 +833,9 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
|
|||
Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
|
||||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(s);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
|
||||
std::string val;
|
||||
ReadOptions ro;
|
||||
|
@ -826,6 +844,7 @@ TEST_P(DBIOCorruptionTest, CompactionReadCorruptionRetry) {
|
|||
ASSERT_EQ(val, "val1");
|
||||
} else {
|
||||
ASSERT_TRUE(s.IsCorruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -838,6 +857,9 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
|
|||
Status s = Flush();
|
||||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(s);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
|
||||
std::string val;
|
||||
ReadOptions ro;
|
||||
|
@ -846,6 +868,7 @@ TEST_P(DBIOCorruptionTest, FlushReadCorruptionRetry) {
|
|||
ASSERT_EQ(val, "val1");
|
||||
} else {
|
||||
ASSERT_NOK(s);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -862,8 +885,12 @@ TEST_P(DBIOCorruptionTest, ManifestCorruptionRetry) {
|
|||
|
||||
if (std::get<2>(GetParam())) {
|
||||
ASSERT_OK(ReopenDB());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 1);
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT),
|
||||
1);
|
||||
} else {
|
||||
ASSERT_EQ(ReopenDB(), Status::Corruption());
|
||||
ASSERT_EQ(stats()->getTickerCount(FILE_READ_CORRUPTION_RETRY_COUNT), 0);
|
||||
}
|
||||
SyncPoint::GetInstance()->DisableProcessing();
|
||||
}
|
||||
|
|
|
@ -529,6 +529,11 @@ enum Tickers : uint32_t {
|
|||
// Footer corruption detected when opening an SST file for reading
|
||||
SST_FOOTER_CORRUPTION_COUNT,
|
||||
|
||||
// Counters for file read retries with the verify_and_reconstruct_read
|
||||
// file system option after detecting a checksum mismatch
|
||||
FILE_READ_CORRUPTION_RETRY_COUNT,
|
||||
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
|
||||
|
||||
TICKER_ENUM_MAX
|
||||
};
|
||||
|
||||
|
|
|
@ -5269,6 +5269,10 @@ class TickerTypeJni {
|
|||
return -0x53;
|
||||
case ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT:
|
||||
return -0x55;
|
||||
case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT:
|
||||
return -0x56;
|
||||
case ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT:
|
||||
return -0x57;
|
||||
case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX:
|
||||
// -0x54 is the max value at this time. Since these values are exposed
|
||||
// directly to Java clients, we'll keep the value the same till the next
|
||||
|
@ -5726,6 +5730,11 @@ class TickerTypeJni {
|
|||
return ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS;
|
||||
case -0x55:
|
||||
return ROCKSDB_NAMESPACE::Tickers::SST_FOOTER_CORRUPTION_COUNT;
|
||||
case -0x56:
|
||||
return ROCKSDB_NAMESPACE::Tickers::FILE_READ_CORRUPTION_RETRY_COUNT;
|
||||
case -0x57:
|
||||
return ROCKSDB_NAMESPACE::Tickers::
|
||||
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT;
|
||||
case -0x54:
|
||||
// -0x54 is the max value at this time. Since these values are exposed
|
||||
// directly to Java clients, we'll keep the value the same till the next
|
||||
|
|
|
@ -878,6 +878,10 @@ public enum TickerType {
|
|||
|
||||
SST_FOOTER_CORRUPTION_COUNT((byte) -0x55),
|
||||
|
||||
FILE_READ_CORRUPTION_RETRY_COUNT((byte) -0x56),
|
||||
|
||||
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT((byte) -0x57),
|
||||
|
||||
TICKER_ENUM_MAX((byte) -0x54);
|
||||
|
||||
private final byte value;
|
||||
|
|
|
@ -266,6 +266,10 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
|
|||
{PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"},
|
||||
{PREFETCH_HITS, "rocksdb.prefetch.hits"},
|
||||
{SST_FOOTER_CORRUPTION_COUNT, "rocksdb.footer.corruption.count"},
|
||||
{FILE_READ_CORRUPTION_RETRY_COUNT,
|
||||
"rocksdb.file.read.corruption.retry.count"},
|
||||
{FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT,
|
||||
"rocksdb.file.read.corruption.retry.success.count"},
|
||||
};
|
||||
|
||||
const std::vector<std::pair<Histograms, std::string>> HistogramsNameMap = {
|
||||
|
|
|
@ -693,6 +693,10 @@ Status BlockBasedTable::Open(
|
|||
s = ReadFooterFromFile(retry_opts, file.get(), *ioptions.fs,
|
||||
prefetch_buffer.get(), file_size, &footer,
|
||||
kBlockBasedTableMagicNumber);
|
||||
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
|
||||
if (s.ok()) {
|
||||
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!s.ok()) {
|
||||
|
|
|
@ -223,13 +223,16 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
|
|||
s = VerifyBlockChecksum(footer, data, handle.size(),
|
||||
rep_->file->file_name(), handle.offset());
|
||||
RecordTick(ioptions.stats, BLOCK_CHECKSUM_COMPUTE_COUNT);
|
||||
if (!s.ok()) {
|
||||
RecordTick(ioptions.stats, BLOCK_CHECKSUM_MISMATCH_COUNT);
|
||||
}
|
||||
TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
|
||||
if (!s.ok() &&
|
||||
CheckFSFeatureSupport(ioptions.fs.get(),
|
||||
FSSupportedOps::kVerifyAndReconstructRead)) {
|
||||
assert(s.IsCorruption());
|
||||
assert(!ioptions.allow_mmap_reads);
|
||||
RecordTick(ioptions.stats, BLOCK_CHECKSUM_MISMATCH_COUNT);
|
||||
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
|
||||
|
||||
// Repeat the read for this particular block using the regular
|
||||
// synchronous Read API. We can use the same chunk of memory
|
||||
|
@ -246,6 +249,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks)
|
|||
assert(result.size() == BlockSizeWithTrailer(handle));
|
||||
s = VerifyBlockChecksum(footer, data, handle.size(),
|
||||
rep_->file->file_name(), handle.offset());
|
||||
if (s.ok()) {
|
||||
RecordTick(ioptions.stats,
|
||||
FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
||||
}
|
||||
} else {
|
||||
s = io_s;
|
||||
}
|
||||
|
|
|
@ -84,7 +84,9 @@ inline bool BlockFetcher::TryGetFromPrefetchBuffer() {
|
|||
if (io_status_.ok()) {
|
||||
got_from_prefetch_buffer_ = true;
|
||||
used_buf_ = const_cast<char*>(slice_.data());
|
||||
} else if (!(io_status_.IsCorruption() && retry_corrupt_read_)) {
|
||||
} else if (io_status_.IsCorruption()) {
|
||||
// Returning true apparently indicates we either got some data from
|
||||
// the prefetch buffer, or we tried and encountered an error.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -334,9 +336,15 @@ void BlockFetcher::ReadBlock(bool retry) {
|
|||
ProcessTrailerIfPresent();
|
||||
}
|
||||
|
||||
if (retry) {
|
||||
RecordTick(ioptions_.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
|
||||
}
|
||||
if (io_status_.ok()) {
|
||||
InsertCompressedBlockToPersistentCacheIfNeeded();
|
||||
fs_buf_ = std::move(read_req.fs_scratch);
|
||||
if (retry) {
|
||||
RecordTick(ioptions_.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
||||
}
|
||||
} else {
|
||||
ReleaseFileSystemProvidedBuffer(&read_req);
|
||||
direct_io_buf_.reset();
|
||||
|
@ -355,7 +363,11 @@ IOStatus BlockFetcher::ReadBlockContents() {
|
|||
return IOStatus::OK();
|
||||
}
|
||||
if (TryGetFromPrefetchBuffer()) {
|
||||
if (io_status_.IsCorruption() && retry_corrupt_read_) {
|
||||
ReadBlock(/*retry=*/true);
|
||||
}
|
||||
if (!io_status_.ok()) {
|
||||
assert(!fs_buf_);
|
||||
return io_status_;
|
||||
}
|
||||
} else if (!TryGetSerializedBlockFromPersistentCache()) {
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Add ticker stats to count file read retries due to checksum mismatch
|
Loading…
Reference in New Issue