mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-27 20:43:57 +00:00
6997a06c63
Summary:
CI has been hitting an assertion error like:
```
#8  0x00007fafd9294fd6 in __GI___assert_fail (assertion=assertion@entry=0x7fafda270300 "!*memtable_range_tombstone_iter_ || sv_number_ != cfd_->GetSuperVersionNumber()", file=file@entry=0x7fafda270350 "db/arena_wrapped_db_iter.cc", line=line@entry=124, function=function@entry=0x7fafda270288 "virtual rocksdb::Status rocksdb::ArenaWrappedDBIter::Refresh(const rocksdb::Snapshot*)") at assert.c:101
```
This is due to
* Iterator::Refresh() passing in `cur_sv_number` instead of `sv->version_number` here: 1c6faf3587/db/arena_wrapped_db_iter.cc (L94-L96)
* `super_version_number_` can be incremented before thread local SV is installed: https://github.com/facebook/rocksdb/blob/main/db/column_family.cc#L1287-L1306
* The optimization in https://github.com/facebook/rocksdb/issues/11452 removed the check for SV number, such that `cur_sv_number > sv.version_number` is possible in the following code.
```
uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
```
It is not clear why the assertion only started failing after https://github.com/facebook/rocksdb/issues/10594; it may be because Refresh() is called more often in the stress test.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11848
Test Plan:
* This repro hits the assertion pretty consistently before this change:
```
./db_stress --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --allow_data_in_errors=True --async_io=0 --atomic_flush=1 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_unnecessary_blocking_io=1 --backup_one_in=0 --block_size=16384 --bloom_bits=0.7161318870366848 --cache_index_and_filter_blocks=0 --cache_size=8388608 --charge_table_reader=0 --checkpoint_one_in=1000000 --checksum_type=kxxHash --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000000 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=0 --compression_checksum=0 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --data_block_index_type=0 --db_write_buffer_size=8388608 --delpercent=4 --delrangepercent=1 --destroy_db_initially=1 --detect_filter_construct_corruption=0 --disable_wal=1 --enable_compaction_filter=0 --enable_pipelined_write=0 --enable_thread_tracking=1 --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=none --flush_one_in=1000000 --format_version=5 --get_current_wal_file_one_in=0 --get_live_files_one_in=1000000 --get_property_one_in=1000000 --get_sorted_wal_files_one_in=0 --index_block_restart_interval=14 --index_type=2 --ingest_external_file_one_in=0 --initial_auto_readahead_size=524288 --iterpercent=30 --key_len_percent_dist=1,30,69 --level_compaction_dynamic_level_bytes=1 --lock_wal_one_in=1000000 --long_running_snapshots=1 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=0 --max_auto_readahead_size=524288 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=2500000 --max_key_len=3 --max_manifest_file_size=1073741824 --max_write_batch_group_size_bytes=16777216 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=1048576 --memtable_max_range_deletions=0 
--memtable_prefix_bloom_size_ratio=0.5 --memtable_protection_bytes_per_key=0 --memtable_whole_key_filtering=1 --memtablerep=skip_list --min_write_buffer_number_to_merge=1 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=1 --open_files=500000 --ops_per_thread=100000000 --optimize_filters_for_memory=1 --paranoid_file_checks=0 --partition_filters=0 --partition_pinning=3 --pause_background_one_in=1000000 --periodic_compaction_seconds=0 --prefix_size=-1 --prefixpercent=0 --prepopulate_block_cache=0 --preserve_internal_time_seconds=0 --progress_reports=0 --read_fault_one_in=32 --readahead_size=16384 --readpercent=30 --recycle_log_file_num=1 --reopen=0 --ribbon_starting_level=999 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --set_options_one_in=10000 --snapshot_hold_ops=100000 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=600 --subcompactions=1 --sync=0 --sync_fault_injection=1 --target_file_size_base=2097152 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=1 --top_level_index_pinning=3 --unpartitioned_pinning=3 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=1048576 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=35 --use_io_uring=0 --db=/tmp/rocksdb_crashtest_blackboxnf3pyv_0 --expected_values_dir=/tmp/rocksdb_crashtest_expected_6opy9nqg
```
Reviewed By: ajkr
Differential Revision: D49344066
Pulled By: cbi42
fbshipit-source-id: d5373ddb48d933acb42a5dd8fae3f3019b0241e5
180 lines
7.4 KiB
C++
180 lines
7.4 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "db/arena_wrapped_db_iter.h"
|
|
|
|
#include "memory/arena.h"
|
|
#include "rocksdb/env.h"
|
|
#include "rocksdb/iterator.h"
|
|
#include "rocksdb/options.h"
|
|
#include "table/internal_iterator.h"
|
|
#include "table/iterator_wrapper.h"
|
|
#include "util/user_comparator_wrapper.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Returns the sequence number a read should be pinned to: the snapshot's
// sequence number when `s` is non-null, otherwise the latest sequence number
// reported by `db`.
inline static SequenceNumber GetSeqNum(const DBImpl* db, const Snapshot* s) {
  if (s == nullptr) {
    return db->GetLatestSequenceNumber();
  }
  return s->GetSequenceNumber();
}
|
|
|
|
// Looks up an iterator property by name and writes its value to `*prop`.
//
// Every property is forwarded to the wrapped DBIter. The only special case is
// "rocksdb.iterator.super-version-number": if the wrapped iterator cannot
// answer it, the super version number recorded by this wrapper (sv_number_) is
// reported instead, so that query always returns OK.
Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
                                       std::string* prop) {
  if (prop_name != "rocksdb.iterator.super-version-number") {
    return db_iter_->GetProperty(prop_name, prop);
  }

  // Prefer the value from the inner iterator; fall back to our own record of
  // the super version number only when the inner lookup fails.
  const bool answered_by_inner = db_iter_->GetProperty(prop_name, prop).ok();
  if (!answered_by_inner) {
    *prop = std::to_string(sv_number_);
  }
  return Status::OK();
}
|
|
|
|
// Constructs the wrapped DBIter inside this object's arena and records the
// state that Refresh() and GetProperty() consult later.
//
// The DBIter is created without an internal iterator (nullptr is passed for
// it). `version_number` is stored as sv_number_, `read_options` and
// `allow_refresh` are copied into members, and the memtable range tombstone
// iterator slot is reset to nullptr. async_io is switched off in the stored
// read options when the env's file system does not report kAsyncIO support.
void ArenaWrappedDBIter::Init(
    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
    const MutableCFOptions& mutable_cf_options, const Version* version,
    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
    uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
    ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
  // Placement-new the DBIter into storage obtained from arena_.
  auto db_iter_storage = arena_.AllocateAligned(sizeof(DBIter));
  db_iter_ = new (db_iter_storage)
      DBIter(env, read_options, ioptions, mutable_cf_options,
             ioptions.user_comparator, /* iter */ nullptr, version, sequence,
             true, max_sequential_skip_in_iteration, read_callback, db_impl,
             cfd, expose_blob_index);

  sv_number_ = version_number;
  read_options_ = read_options;
  allow_refresh_ = allow_refresh;
  memtable_range_tombstone_iter_ = nullptr;

  // Only keep async_io enabled if the underlying file system supports it.
  const bool fs_has_async_io = CheckFSFeatureSupport(
      env->GetFileSystem().get(), FSSupportedOps::kAsyncIO);
  if (!fs_has_async_io) {
    read_options_.async_io = false;
  }
}
|
|
|
|
// Refreshes the iterator without a snapshot; delegates to
// Refresh(const Snapshot*) with a null snapshot.
Status ArenaWrappedDBIter::Refresh() {
  const Snapshot* const no_snapshot = nullptr;
  return Refresh(no_snapshot);
}
|
|
|
|
// Brings the iterator up to date with the column family's current state.
//
// Reads are pinned to `snapshot`'s sequence number, or to the DB's latest
// sequence number when `snapshot` is null. Returns NotSupported when cfd_ or
// db_impl_ is unset or allow_refresh_ is false; otherwise returns OK.
//
// Two paths exist:
//  * The SuperVersion number differs from the one recorded in sv_number_, or
//    the mutable memtable now has range tombstones while this iterator holds
//    no memtable range tombstone iterator: the DBIter and its arena are
//    destroyed and rebuilt from a freshly referenced SuperVersion.
//  * Otherwise the memtable range tombstone iterator (if any) is replaced in
//    place, the read sequence number is updated and the iterator is marked
//    invalid.
Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) {
  const bool refresh_supported =
      cfd_ != nullptr && db_impl_ != nullptr && allow_refresh_;
  if (!refresh_supported) {
    return Status::NotSupported("Creating renew iterator is not allowed.");
  }
  assert(db_iter_ != nullptr);

  // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
  // correct behavior. Will be corrected automatically when we take a snapshot
  // here for the case of WritePreparedTxnDB.
  uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();

  // read_options_ is what gets passed to NewInternalIterator() if the internal
  // iterator is rebuilt below, so it must carry the requested snapshot.
  read_options_.snapshot = snapshot;
  TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
  TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");

  // Destroys the current DBIter and arena, then rebuilds both on top of a
  // newly referenced SuperVersion.
  auto rebuild_internal_iter = [&]() {
    Env* const env = db_iter_->env();
    db_iter_->~DBIter();
    arena_.~Arena();
    new (&arena_) Arena();

    SuperVersion* const fresh_sv = cfd_->GetReferencedSuperVersion(db_impl_);
    assert(fresh_sv->version_number >= cur_sv_number);
    const SequenceNumber seq = GetSeqNum(db_impl_, snapshot);
    if (read_callback_ != nullptr) {
      read_callback_->Refresh(seq);
    }
    // Record the version number of the SuperVersion actually acquired rather
    // than cur_sv_number: as the assert above allows, the two may differ.
    Init(env, read_options_, *(cfd_->ioptions()), fresh_sv->mutable_cf_options,
         fresh_sv->current, seq,
         fresh_sv->mutable_cf_options.max_sequential_skip_in_iterations,
         fresh_sv->version_number, read_callback_, db_impl_, cfd_,
         expose_blob_index_, allow_refresh_);

    InternalIterator* const rebuilt_iter = db_impl_->NewInternalIterator(
        read_options_, cfd_, fresh_sv, &arena_, seq,
        /* allow_unprepared_value */ true, /* db_iter */ this);
    SetIterUnderDBIter(rebuilt_iter);
  };

  for (;;) {
    if (sv_number_ != cur_sv_number) {
      // SuperVersion changed since this iterator was (re)initialized.
      rebuild_internal_iter();
      break;
    }

    const SequenceNumber read_seq = GetSeqNum(db_impl_, snapshot);

    // Refresh range tombstones from the mutable memtable.
    if (!read_options_.ignore_range_deletions) {
      SuperVersion* const sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
      TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr);
      auto* mem_tombstones = sv->mem->NewRangeTombstoneIterator(
          read_options_, read_seq, false /* immutable_memtable */);
      const bool mem_has_tombstones =
          mem_tombstones != nullptr && !mem_tombstones->empty();

      if (mem_has_tombstones) {
        // The current mutable memtable has range tombstones.
        if (!memtable_range_tombstone_iter_) {
          // The memtable under DBIter had no range tombstone before this
          // refresh, so there is no slot to update in place: rebuild.
          delete mem_tombstones;
          db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
          rebuild_internal_iter();
          break;
        }
        // Swap the new tombstone iterator into the existing slot.
        delete *memtable_range_tombstone_iter_;
        *memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator(
            std::unique_ptr<FragmentedRangeTombstoneIterator>(mem_tombstones),
            &cfd_->internal_comparator(), nullptr, nullptr);
      } else {
        // If memtable_range_tombstone_iter_ points to a non-empty tombstone
        // iterator, then sv->mem is not the memtable that iterator was created
        // from, so the SuperVersion must have changed after the
        // sv_number_ != cur_sv_number check at the top of the loop. The
        // re-check below then falls back to rebuilding the internal iterator,
        // and the old tombstone iterator is freed when db_iter_ is destroyed
        // there.
        if (memtable_range_tombstone_iter_) {
          assert(!*memtable_range_tombstone_iter_ ||
                 sv_number_ != cfd_->GetSuperVersionNumber());
        }
        delete mem_tombstones;
      }
      db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
    }

    // Check again whether the latest super version number has changed.
    const uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
    if (latest_sv_number == cur_sv_number) {
      // Nothing changed underneath us: the in-place refresh is complete.
      db_iter_->set_sequence(read_seq);
      db_iter_->set_valid(false);
      break;
    }

    // The super version number changed while refreshing; loop again so the
    // check at the top of the loop rebuilds the internal iterator.
    cur_sv_number = latest_sv_number;
  }
  return Status::OK();
}
|
|
|
|
// Allocates an ArenaWrappedDBIter with `new`, initializes it with the given
// arguments and returns it.
//
// Refresh information (db_impl, cfd, read_callback, expose_blob_index) is
// stored on the iterator only when both `db_impl` and `cfd` are non-null and
// `allow_refresh` is true.
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
    Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
    const MutableCFOptions& mutable_cf_options, const Version* version,
    const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
    uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
    ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
  auto* wrapped_iter = new ArenaWrappedDBIter();
  wrapped_iter->Init(env, read_options, ioptions, mutable_cf_options, version,
                     sequence, max_sequential_skip_in_iterations,
                     version_number, read_callback, db_impl, cfd,
                     expose_blob_index, allow_refresh);

  const bool can_refresh =
      db_impl != nullptr && cfd != nullptr && allow_refresh;
  if (can_refresh) {
    wrapped_iter->StoreRefreshInfo(db_impl, cfd, read_callback,
                                   expose_blob_index);
  }

  return wrapped_iter;
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|