mirror of https://github.com/facebook/rocksdb.git
Handle failures in block-based table size/offset approximation (#9615)
Summary: In crash test with fault injection, we were seeing stack traces like the following: ``` https://github.com/facebook/rocksdb/issues/3 0x00007f75f763c533 in __GI___assert_fail (assertion=assertion@entry=0x1c5b2a0 "end_offset >= start_offset", file=file@entry=0x1c580a0 "table/block_based/block_based_table_reader.cc", line=line@entry=3245, function=function@entry=0x1c60e60 "virtual uint64_t rocksdb::BlockBasedTable::ApproximateSize(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::TableReaderCaller)") at assert.c:101 https://github.com/facebook/rocksdb/issues/4 0x00000000010ea9b4 in rocksdb::BlockBasedTable::ApproximateSize (this=<optimized out>, start=..., end=..., caller=<optimized out>) at table/block_based/block_based_table_reader.cc:3224 https://github.com/facebook/rocksdb/issues/5 0x0000000000be61fb in rocksdb::TableCache::ApproximateSize (this=0x60f0000161b0, start=..., end=..., fd=..., caller=caller@entry=rocksdb::kCompaction, internal_comparator=..., prefix_extractor=...) at db/table_cache.cc:719 https://github.com/facebook/rocksdb/issues/6 0x0000000000c3eaec in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, v=<optimized out>, f=..., start=..., end=..., caller=<optimized out>) at ./db/version_set.h:850 https://github.com/facebook/rocksdb/issues/7 0x0000000000c6ebc3 in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, options=..., v=v@entry=0x621000047500, start=..., end=..., start_level=start_level@entry=0, end_level=<optimized out>, caller=<optimized out>) at db/version_set.cc:5657 https://github.com/facebook/rocksdb/issues/8 0x000000000166e894 in rocksdb::CompactionJob::GenSubcompactionBoundaries (this=<optimized out>) at ./include/rocksdb/options.h:1869 https://github.com/facebook/rocksdb/issues/9 0x000000000168c526 in rocksdb::CompactionJob::Prepare (this=this@entry=0x7f75f3ffcf00) at db/compaction/compaction_job.cc:546 ``` The problem occurred in `ApproximateSize()` when the index `Seek()` for the first `ApproximateDataOffsetOf()` encountered an I/O error, while the second `Seek()` did not. In the old code that scenario caused `start_offset == data_size` , thus it was easy to trip the assertion that `end_offset >= start_offset`. The fix is to set `start_offset == 0` when the first index `Seek()` fails, and `end_offset == data_size` when the second index `Seek()` fails. I doubt these give an "on average correct" answer for how this function is used, but I/O errors in index seeks are hopefully rare, it looked consistent with what was already there, and it was easier to calculate. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9615 Test Plan: run the repro command for a while and stopped seeing coredumps - ``` $ while ! ./db_stress --block_size=128 --cache_size=32768 --clear_column_family_one_in=0 --column_families=1 --continuous_verification_interval=0 --db=/dev/shm/rocksdb_crashtest --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --index_type=2 --iterpercent=10 --kill_random_test=18887 --max_key=1000000 --max_bytes_for_level_base=2048576 --nooverwritepercent=1 --open_files=-1 --open_read_fault_one_in=32 --ops_per_thread=1000000 --prefixpercent=5 --read_fault_one_in=0 --readpercent=45 --reopen=0 --skip_verifydb=1 --subcompactions=2 --target_file_size_base=524288 --test_batches_snapshots=0 --value_size_mult=32 --write_buffer_size=524288 --writepercent=35 ; do : ; done ``` Reviewed By: pdillinger Differential Revision: D34383069 Pulled By: ajkr fbshipit-source-id: fac26c3b20ea962e75387515ba5f2724dc48719f
This commit is contained in:
parent
ddb7620a61
commit
0a89cea5f5
|
@ -3158,6 +3158,7 @@ Status BlockBasedTable::CreateIndexReader(
|
|||
uint64_t BlockBasedTable::ApproximateDataOffsetOf(
|
||||
const InternalIteratorBase<IndexValue>& index_iter,
|
||||
uint64_t data_size) const {
|
||||
assert(index_iter.status().ok());
|
||||
if (index_iter.Valid()) {
|
||||
BlockHandle handle = index_iter.value().handle;
|
||||
return handle.offset();
|
||||
|
@ -3200,8 +3201,16 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
|
|||
}
|
||||
|
||||
index_iter->Seek(key);
|
||||
uint64_t offset;
|
||||
if (index_iter->status().ok()) {
|
||||
offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||
} else {
|
||||
// Split in half to avoid skewing one way or another,
|
||||
// since we don't know whether we're operating on lower bound or
|
||||
// upper bound.
|
||||
return rep_->file_size / 2;
|
||||
}
|
||||
|
||||
uint64_t offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||
// Pro-rate file metadata (incl filters) size-proportionally across data
|
||||
// blocks.
|
||||
double size_ratio =
|
||||
|
@ -3217,7 +3226,9 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
|
|||
uint64_t data_size = GetApproximateDataSize();
|
||||
if (UNLIKELY(data_size == 0)) {
|
||||
// Hmm. Assume whole file is involved, since we have lower and upper
|
||||
// bound.
|
||||
// bound. This likely skews the estimate if we consider that this function
|
||||
// is typically called with `[start, end]` fully contained in the file's
|
||||
// key-range.
|
||||
return rep_->file_size;
|
||||
}
|
||||
|
||||
|
@ -3235,9 +3246,24 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
|
|||
}
|
||||
|
||||
index_iter->Seek(start);
|
||||
uint64_t start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||
uint64_t start_offset;
|
||||
if (index_iter->status().ok()) {
|
||||
start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||
} else {
|
||||
// Assume file is involved from the start. This likely skews the estimate
|
||||
// but is consistent with the above error handling.
|
||||
start_offset = 0;
|
||||
}
|
||||
|
||||
index_iter->Seek(end);
|
||||
uint64_t end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||
uint64_t end_offset;
|
||||
if (index_iter->status().ok()) {
|
||||
end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
|
||||
} else {
|
||||
// Assume file is involved until the end. This likely skews the estimate
|
||||
// but is consistent with the above error handling.
|
||||
end_offset = data_size;
|
||||
}
|
||||
|
||||
assert(end_offset >= start_offset);
|
||||
// Pro-rate file metadata (incl filters) size-proportionally across data
|
||||
|
|
Loading…
Reference in New Issue