mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-29 09:36:17 +00:00
39455974cb
Summary: Not sure where or how it happens, but using a recent CircleCI failure I got a reliable db_stress reproducer. Using std::unique_ptr appropriately for managing them has apparently (and unsurprisingly) fixed the problem without needing to know exactly where the problem was. Suggested follow-up: * Three or even four levels of pointers is very confusing to work with. Surely this part can be cleaned up to be simpler. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12805 Test Plan: Reproducer passes, plus ASAN test and crash test runs. I don't think it's worth the extra work to track down the details and create a careful unit test. ``` ./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --adm_policy=2 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=1 --avoid_unnecessary_blocking_io=1 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=1000000 --block_align=1 --block_protection_bytes_per_key=4 --block_size=16384 --bloom_before_level=2147483646 --bloom_bits=15 --bottommost_compression_type=none --bottommost_file_compaction_delay=3600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=33554432 --cache_type=tiered_lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --check_multiget_consistency=1 --check_multiget_entity_consistency=1 --checkpoint_one_in=10000 --checksum_type=kxxHash --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000 --compaction_pri=0 --compaction_readahead_size=0 --compaction_ttl=0 --compress_format_version=2 --compressed_secondary_cache_ratio=0.2 --compressed_secondary_cache_size=0 
--compression_checksum=0 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=none --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=0 --db=/dev/shm/rocksdb.gpxs/rocksdb_crashtest_blackbox --db_write_buffer_size=0 --default_temperature=kWarm --default_write_temperature=kCold --delete_obsolete_files_period_micros=21600000000 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=1 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=0 --enable_thread_tracking=1 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=0 --expected_values_dir=/dev/shm/rocksdb.gpxs/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --fill_cache=1 --flush_one_in=1000000 --format_version=3 --get_all_column_family_metadata_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0 --index_block_restart_interval=4 --index_shortening=0 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kHot --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=1000000 --log_file_time_to_roll=0 --log_readahead_size=0 
--long_running_snapshots=1 --low_pri_pool_ratio=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=5120 --manual_wal_flush_one_in=1000 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=2500000 --max_key_len=3 --max_log_file_size=0 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=1 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=100 --memtable_prefix_bloom_size_ratio=0 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --metadata_charge_policy=0 --metadata_read_fault_one_in=32 --metadata_write_fault_one_in=0 --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=100 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=8 --open_read_fault_one_in=0 --open_write_fault_one_in=16 --ops_per_thread=100000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=1000000 --periodic_compaction_seconds=0 --prefix_size=-1 --prefixpercent=0 --prepopulate_block_cache=1 --preserve_internal_time_seconds=60 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=32 --readahead_size=524288 --readpercent=50 --recycle_log_file_num=1 --reopen=0 --report_bg_io_stats=1 --reset_stats_one_in=10000 --sample_for_compression=5 --secondary_cache_fault_one_in=32 --secondary_cache_uri= --set_options_one_in=10000 --skip_stats_update_on_db_open=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=bar --sqfc_version=1 --sst_file_manager_bytes_per_sec=104857600 
--sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=0 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=3 --sync=0 --sync_fault_injection=1 --table_cache_numshardbits=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=1 --top_level_index_pinning=1 --uncache_aggressiveness=5 --universal_max_read_amp=-1 --unpartitioned_pinning=2 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=0 --use_attribute_group=1 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=1 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_iterator_with_expected_state_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=1048576 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=35 ``` Reviewed By: cbi42 Differential Revision: D58958390 Pulled By: pdillinger fbshipit-source-id: 1271cfdcc3c574f78cd59f3c68148f7ed4a19c47
371 lines
12 KiB
C++
371 lines
12 KiB
C++
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
//
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
#include "table/compaction_merging_iterator.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
class CompactionMergingIterator : public InternalIterator {
|
|
public:
|
|
CompactionMergingIterator(
|
|
const InternalKeyComparator* comparator, InternalIterator** children,
|
|
int n, bool is_arena_mode,
|
|
std::vector<std::pair<std::unique_ptr<TruncatedRangeDelIterator>,
|
|
std::unique_ptr<TruncatedRangeDelIterator>**>>&
|
|
range_tombstones)
|
|
: is_arena_mode_(is_arena_mode),
|
|
comparator_(comparator),
|
|
current_(nullptr),
|
|
minHeap_(CompactionHeapItemComparator(comparator_)),
|
|
pinned_iters_mgr_(nullptr) {
|
|
children_.resize(n);
|
|
for (int i = 0; i < n; i++) {
|
|
children_[i].level = i;
|
|
children_[i].iter.Set(children[i]);
|
|
assert(children_[i].type == HeapItem::ITERATOR);
|
|
}
|
|
assert(range_tombstones.size() == static_cast<size_t>(n));
|
|
for (auto& p : range_tombstones) {
|
|
range_tombstone_iters_.push_back(std::move(p.first));
|
|
}
|
|
pinned_heap_item_.resize(n);
|
|
for (int i = 0; i < n; ++i) {
|
|
if (range_tombstones[i].second) {
|
|
// for LevelIterator
|
|
*range_tombstones[i].second = &range_tombstone_iters_[i];
|
|
}
|
|
pinned_heap_item_[i].level = i;
|
|
pinned_heap_item_[i].type = HeapItem::DELETE_RANGE_START;
|
|
}
|
|
}
|
|
|
|
void considerStatus(const Status& s) {
|
|
if (!s.ok() && status_.ok()) {
|
|
status_ = s;
|
|
}
|
|
}
|
|
|
|
~CompactionMergingIterator() override {
|
|
range_tombstone_iters_.clear();
|
|
|
|
for (auto& child : children_) {
|
|
child.iter.DeleteIter(is_arena_mode_);
|
|
}
|
|
status_.PermitUncheckedError();
|
|
}
|
|
|
|
bool Valid() const override { return current_ != nullptr && status_.ok(); }
|
|
|
|
Status status() const override { return status_; }
|
|
|
|
void SeekToFirst() override;
|
|
|
|
void Seek(const Slice& target) override;
|
|
|
|
void Next() override;
|
|
|
|
Slice key() const override {
|
|
assert(Valid());
|
|
return current_->key();
|
|
}
|
|
|
|
Slice value() const override {
|
|
assert(Valid());
|
|
if (LIKELY(current_->type == HeapItem::ITERATOR)) {
|
|
return current_->iter.value();
|
|
} else {
|
|
return dummy_tombstone_val;
|
|
}
|
|
}
|
|
|
|
// Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result
|
|
// from current child iterator. Potentially as long as one of child iterator
|
|
// report out of bound is not possible, we know current key is within bound.
|
|
bool MayBeOutOfLowerBound() override {
|
|
assert(Valid());
|
|
return current_->type == HeapItem::DELETE_RANGE_START ||
|
|
current_->iter.MayBeOutOfLowerBound();
|
|
}
|
|
|
|
IterBoundCheck UpperBoundCheckResult() override {
|
|
assert(Valid());
|
|
return current_->type == HeapItem::DELETE_RANGE_START
|
|
? IterBoundCheck::kUnknown
|
|
: current_->iter.UpperBoundCheckResult();
|
|
}
|
|
|
|
void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
|
|
pinned_iters_mgr_ = pinned_iters_mgr;
|
|
for (auto& child : children_) {
|
|
child.iter.SetPinnedItersMgr(pinned_iters_mgr);
|
|
}
|
|
}
|
|
|
|
bool IsDeleteRangeSentinelKey() const override {
|
|
assert(Valid());
|
|
return current_->type == HeapItem::DELETE_RANGE_START;
|
|
}
|
|
|
|
// Compaction uses the above subset of InternalIterator interface.
|
|
void SeekToLast() override { assert(false); }
|
|
|
|
void SeekForPrev(const Slice&) override { assert(false); }
|
|
|
|
void Prev() override { assert(false); }
|
|
|
|
bool NextAndGetResult(IterateResult*) override {
|
|
assert(false);
|
|
return false;
|
|
}
|
|
|
|
bool IsKeyPinned() const override {
|
|
assert(false);
|
|
return false;
|
|
}
|
|
|
|
bool IsValuePinned() const override {
|
|
assert(false);
|
|
return false;
|
|
}
|
|
|
|
bool PrepareValue() override {
|
|
assert(false);
|
|
return false;
|
|
}
|
|
|
|
private:
|
|
struct HeapItem {
|
|
HeapItem() = default;
|
|
|
|
IteratorWrapper iter;
|
|
size_t level = 0;
|
|
std::string tombstone_str;
|
|
enum Type { ITERATOR, DELETE_RANGE_START };
|
|
Type type = ITERATOR;
|
|
|
|
explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
|
|
: level(_level), type(Type::ITERATOR) {
|
|
iter.Set(_iter);
|
|
}
|
|
|
|
void SetTombstoneForCompaction(const ParsedInternalKey&& pik) {
|
|
tombstone_str.clear();
|
|
AppendInternalKey(&tombstone_str, pik);
|
|
}
|
|
|
|
[[nodiscard]] Slice key() const {
|
|
return type == ITERATOR ? iter.key() : tombstone_str;
|
|
}
|
|
};
|
|
|
|
class CompactionHeapItemComparator {
|
|
public:
|
|
explicit CompactionHeapItemComparator(
|
|
const InternalKeyComparator* comparator)
|
|
: comparator_(comparator) {}
|
|
|
|
bool operator()(HeapItem* a, HeapItem* b) const {
|
|
int r = comparator_->Compare(a->key(), b->key());
|
|
// For each file, we assume all range tombstone start keys come before
|
|
// its file boundary sentinel key (file's meta.largest key).
|
|
// In the case when meta.smallest = meta.largest and range tombstone start
|
|
// key is truncated at meta.smallest, the start key will have op_type =
|
|
// kMaxValid to make it smaller (see TruncatedRangeDelIterator
|
|
// constructor). The following assertion validates this assumption.
|
|
assert(a->type == b->type || r != 0);
|
|
return r > 0;
|
|
}
|
|
|
|
private:
|
|
const InternalKeyComparator* comparator_;
|
|
};
|
|
|
|
using CompactionMinHeap = BinaryHeap<HeapItem*, CompactionHeapItemComparator>;
|
|
bool is_arena_mode_;
|
|
const InternalKeyComparator* comparator_;
|
|
// HeapItem for all child point iterators.
|
|
std::vector<HeapItem> children_;
|
|
// HeapItem for range tombstones. pinned_heap_item_[i] corresponds to the
|
|
// current range tombstone from range_tombstone_iters_[i].
|
|
std::vector<HeapItem> pinned_heap_item_;
|
|
// range_tombstone_iters_[i] contains range tombstones in the sorted run that
|
|
// corresponds to children_[i]. range_tombstone_iters_[i] ==
|
|
// nullptr means the sorted run of children_[i] does not have range
|
|
// tombstones (or the current SSTable does not have range tombstones in the
|
|
// case of LevelIterator).
|
|
std::vector<std::unique_ptr<TruncatedRangeDelIterator>>
|
|
range_tombstone_iters_;
|
|
// Used as value for range tombstone keys
|
|
std::string dummy_tombstone_val{};
|
|
|
|
// Skip file boundary sentinel keys.
|
|
void FindNextVisibleKey();
|
|
|
|
// top of minHeap_
|
|
HeapItem* current_;
|
|
// If any of the children have non-ok status, this is one of them.
|
|
Status status_;
|
|
CompactionMinHeap minHeap_;
|
|
PinnedIteratorsManager* pinned_iters_mgr_;
|
|
// Process a child that is not in the min heap.
|
|
// If valid, add to the min heap. Otherwise, check status.
|
|
void AddToMinHeapOrCheckStatus(HeapItem*);
|
|
|
|
HeapItem* CurrentForward() const {
|
|
return !minHeap_.empty() ? minHeap_.top() : nullptr;
|
|
}
|
|
|
|
void InsertRangeTombstoneAtLevel(size_t level) {
|
|
if (range_tombstone_iters_[level]->Valid()) {
|
|
pinned_heap_item_[level].SetTombstoneForCompaction(
|
|
range_tombstone_iters_[level]->start_key());
|
|
minHeap_.push(&pinned_heap_item_[level]);
|
|
}
|
|
}
|
|
};
|
|
|
|
void CompactionMergingIterator::SeekToFirst() {
|
|
minHeap_.clear();
|
|
status_ = Status::OK();
|
|
for (auto& child : children_) {
|
|
child.iter.SeekToFirst();
|
|
AddToMinHeapOrCheckStatus(&child);
|
|
}
|
|
|
|
for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
|
|
if (range_tombstone_iters_[i]) {
|
|
range_tombstone_iters_[i]->SeekToFirst();
|
|
InsertRangeTombstoneAtLevel(i);
|
|
}
|
|
}
|
|
|
|
FindNextVisibleKey();
|
|
current_ = CurrentForward();
|
|
}
|
|
|
|
void CompactionMergingIterator::Seek(const Slice& target) {
|
|
minHeap_.clear();
|
|
status_ = Status::OK();
|
|
for (auto& child : children_) {
|
|
child.iter.Seek(target);
|
|
AddToMinHeapOrCheckStatus(&child);
|
|
}
|
|
|
|
ParsedInternalKey pik;
|
|
ParseInternalKey(target, &pik, false /* log_err_key */)
|
|
.PermitUncheckedError();
|
|
for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) {
|
|
if (range_tombstone_iters_[i]) {
|
|
range_tombstone_iters_[i]->Seek(pik.user_key);
|
|
// For compaction, output keys should all be after seek target.
|
|
while (range_tombstone_iters_[i]->Valid() &&
|
|
comparator_->Compare(range_tombstone_iters_[i]->start_key(), pik) <
|
|
0) {
|
|
range_tombstone_iters_[i]->Next();
|
|
}
|
|
InsertRangeTombstoneAtLevel(i);
|
|
}
|
|
}
|
|
|
|
FindNextVisibleKey();
|
|
current_ = CurrentForward();
|
|
}
|
|
|
|
void CompactionMergingIterator::Next() {
|
|
assert(Valid());
|
|
// For the heap modifications below to be correct, current_ must be the
|
|
// current top of the heap.
|
|
assert(current_ == CurrentForward());
|
|
// as the current points to the current record. move the iterator forward.
|
|
if (current_->type == HeapItem::ITERATOR) {
|
|
current_->iter.Next();
|
|
if (current_->iter.Valid()) {
|
|
// current is still valid after the Next() call above. Call
|
|
// replace_top() to restore the heap property. When the same child
|
|
// iterator yields a sequence of keys, this is cheap.
|
|
assert(current_->iter.status().ok());
|
|
minHeap_.replace_top(current_);
|
|
} else {
|
|
// current stopped being valid, remove it from the heap.
|
|
considerStatus(current_->iter.status());
|
|
minHeap_.pop();
|
|
}
|
|
} else {
|
|
assert(current_->type == HeapItem::DELETE_RANGE_START);
|
|
size_t level = current_->level;
|
|
assert(range_tombstone_iters_[level]);
|
|
range_tombstone_iters_[level]->Next();
|
|
if (range_tombstone_iters_[level]->Valid()) {
|
|
pinned_heap_item_[level].SetTombstoneForCompaction(
|
|
range_tombstone_iters_[level]->start_key());
|
|
minHeap_.replace_top(&pinned_heap_item_[level]);
|
|
} else {
|
|
minHeap_.pop();
|
|
}
|
|
}
|
|
FindNextVisibleKey();
|
|
current_ = CurrentForward();
|
|
}
|
|
|
|
void CompactionMergingIterator::FindNextVisibleKey() {
|
|
while (!minHeap_.empty()) {
|
|
HeapItem* current = minHeap_.top();
|
|
// IsDeleteRangeSentinelKey() here means file boundary sentinel keys.
|
|
if (current->type != HeapItem::ITERATOR ||
|
|
!current->iter.IsDeleteRangeSentinelKey()) {
|
|
return;
|
|
}
|
|
// range tombstone start keys from the same SSTable should have been
|
|
// exhausted
|
|
assert(!range_tombstone_iters_[current->level] ||
|
|
!range_tombstone_iters_[current->level]->Valid());
|
|
// current->iter is a LevelIterator, and it enters a new SST file in the
|
|
// Next() call here.
|
|
current->iter.Next();
|
|
if (current->iter.Valid()) {
|
|
assert(current->iter.status().ok());
|
|
minHeap_.replace_top(current);
|
|
} else {
|
|
considerStatus(current->iter.status());
|
|
minHeap_.pop();
|
|
}
|
|
if (range_tombstone_iters_[current->level]) {
|
|
InsertRangeTombstoneAtLevel(current->level);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Handles a child that is currently not in the min heap: an exhausted or
// failed child only contributes its status; a valid child is pushed onto the
// heap.
void CompactionMergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) {
  if (!child->iter.Valid()) {
    considerStatus(child->iter.status());
    return;
  }
  assert(child->iter.status().ok());
  minHeap_.push(child);
}
|
|
|
|
// Factory for the compaction merging iterator.
//   - n == 0: returns an empty iterator obtained from
//     NewEmptyInternalIterator<Slice>(arena).
//   - arena == nullptr: the iterator is heap-allocated with `new`.
//   - otherwise: the iterator is placement-constructed in memory obtained
//     from `arena` and flagged as arena mode.
// `range_tombstone_iters` is handed to the CompactionMergingIterator
// constructor unchanged.
InternalIterator* NewCompactionMergingIterator(
    const InternalKeyComparator* comparator, InternalIterator** children, int n,
    std::vector<std::pair<std::unique_ptr<TruncatedRangeDelIterator>,
                          std::unique_ptr<TruncatedRangeDelIterator>**>>&
        range_tombstone_iters,
    Arena* arena) {
  assert(n >= 0);
  if (n == 0) {
    return NewEmptyInternalIterator<Slice>(arena);
  }

  if (arena != nullptr) {
    auto* buffer = arena->AllocateAligned(sizeof(CompactionMergingIterator));
    return new (buffer) CompactionMergingIterator(comparator, children, n,
                                                  true /* is_arena_mode */,
                                                  range_tombstone_iters);
  }

  return new CompactionMergingIterator(comparator, children, n,
                                       false /* is_arena_mode */,
                                       range_tombstone_iters);
}
|
|
} // namespace ROCKSDB_NAMESPACE
|