mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-04 20:02:50 +00:00
39455974cb
Summary: Not sure where or how it happens, but using a recent CircleCI failure I got a reliable db_stress reproducer. Using std::unique_ptr appropriately for managing them has apparently (and unsurprisingly) fixed the problem without needing to know exactly where the problem was. Suggested follow-up: * Three or even four levels of pointers is very confusing to work with. Surely this part can be cleaned up to be simpler. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12805 Test Plan: Reproducer passes, plus ASAN test and crash test runs. I don't think it's worth the extra work to track down the details and create a careful unit test. ``` ./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=10000 --adaptive_readahead=1 --adm_policy=2 --advise_random_on_open=1 --allow_data_in_errors=True --allow_fallocate=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=1 --avoid_unnecessary_blocking_io=1 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=1000000 --block_align=1 --block_protection_bytes_per_key=4 --block_size=16384 --bloom_before_level=2147483646 --bloom_bits=15 --bottommost_compression_type=none --bottommost_file_compaction_delay=3600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=0 --cache_index_and_filter_blocks_with_high_priority=0 --cache_size=33554432 --cache_type=tiered_lru_cache --charge_compression_dictionary_building_buffer=0 --charge_file_metadata=1 --charge_filter_construction=0 --charge_table_reader=0 --check_multiget_consistency=1 --check_multiget_entity_consistency=1 --checkpoint_one_in=10000 --checksum_type=kxxHash --clear_column_family_one_in=0 --compact_files_one_in=1000000 --compact_range_one_in=1000 --compaction_pri=0 --compaction_readahead_size=0 --compaction_ttl=0 --compress_format_version=2 --compressed_secondary_cache_ratio=0.2 --compressed_secondary_cache_size=0 --compression_checksum=0 --compression_max_dict_buffer_bytes=0 --compression_max_dict_bytes=0 --compression_parallel_threads=1 --compression_type=none --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc= --data_block_index_type=0 --db=/dev/shm/rocksdb.gpxs/rocksdb_crashtest_blackbox --db_write_buffer_size=0 --default_temperature=kWarm --default_write_temperature=kCold --delete_obsolete_files_period_micros=21600000000 --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --detect_filter_construct_corruption=0 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=1 --enable_checksum_handoff=1 --enable_compaction_filter=0 --enable_custom_split_merge=0 --enable_do_not_compress_roles=0 --enable_index_compression=0 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=0 --enable_thread_tracking=1 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=0 --expected_values_dir=/dev/shm/rocksdb.gpxs/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=0 --file_checksum_impl=none --fill_cache=1 --flush_one_in=1000000 --format_version=3 --get_all_column_family_metadata_one_in=1000000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=10000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=274877906944 --high_pri_pool_ratio=0 --index_block_restart_interval=4 --index_shortening=0 --index_type=0 --ingest_external_file_one_in=0 --initial_auto_readahead_size=16384 --inplace_update_support=0 --iterpercent=10 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kHot --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=1000000 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0 --lowest_used_cache_tier=2 --manifest_preallocation_size=5120 --manual_wal_flush_one_in=1000 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=20 --max_bytes_for_level_base=10485760 --max_key=2500000 --max_key_len=3 --max_log_file_size=0 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=1 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=3 --max_write_buffer_size_to_maintain=0 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=100 --memtable_prefix_bloom_size_ratio=0 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=0 --memtablerep=skip_list --metadata_charge_policy=0 --metadata_read_fault_one_in=32 --metadata_write_fault_one_in=0 --min_write_buffer_number_to_merge=2 --mmap_read=1 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=100 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=8 --open_read_fault_one_in=0 --open_write_fault_one_in=16 --ops_per_thread=100000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=1000000 --periodic_compaction_seconds=0 --prefix_size=-1 --prefixpercent=0 --prepopulate_block_cache=1 --preserve_internal_time_seconds=60 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=32 --readahead_size=524288 --readpercent=50 --recycle_log_file_num=1 --reopen=0 --report_bg_io_stats=1 --reset_stats_one_in=10000 --sample_for_compression=5 --secondary_cache_fault_one_in=32 --secondary_cache_uri= --set_options_one_in=10000 --skip_stats_update_on_db_open=0 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=bar --sqfc_version=1 --sst_file_manager_bytes_per_sec=104857600 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=0 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=3 --sync=0 --sync_fault_injection=1 --table_cache_numshardbits=0 --target_file_size_base=524288 --target_file_size_multiplier=2 --test_batches_snapshots=0 --test_cf_consistency=1 --top_level_index_pinning=1 --uncache_aggressiveness=5 --universal_max_read_amp=-1 --unpartitioned_pinning=2 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=0 --use_attribute_group=1 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=1 --verify_db_one_in=100000 --verify_file_checksums_one_in=0 --verify_iterator_with_expected_state_one_in=0 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=1048576 --write_dbid_to_manifest=1 --write_fault_one_in=0 --writepercent=35 ``` Reviewed By: cbi42 Differential Revision: D58958390 Pulled By: pdillinger fbshipit-source-id: 1271cfdcc3c574f78cd59f3c68148f7ed4a19c47
746 lines
28 KiB
C++
746 lines
28 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "db/table_cache.h"
|
|
|
|
#include "db/dbformat.h"
|
|
#include "db/range_tombstone_fragmenter.h"
|
|
#include "db/snapshot_impl.h"
|
|
#include "db/version_edit.h"
|
|
#include "file/file_util.h"
|
|
#include "file/filename.h"
|
|
#include "file/random_access_file_reader.h"
|
|
#include "monitoring/perf_context_imp.h"
|
|
#include "rocksdb/advanced_options.h"
|
|
#include "rocksdb/statistics.h"
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
#include "table/get_context.h"
|
|
#include "table/internal_iterator.h"
|
|
#include "table/iterator_wrapper.h"
|
|
#include "table/multiget_context.h"
|
|
#include "table/table_builder.h"
|
|
#include "table/table_reader.h"
|
|
#include "test_util/sync_point.h"
|
|
#include "util/cast_util.h"
|
|
#include "util/coding.h"
|
|
#include "util/stop_watch.h"
|
|
|
|
// Generate the regular and coroutine versions of some methods by
|
|
// including table_cache_sync_and_async.h twice
|
|
// Macros in the header will expand differently based on whether
|
|
// WITH_COROUTINES or WITHOUT_COROUTINES is defined
|
|
// clang-format off
|
|
#define WITHOUT_COROUTINES
|
|
#include "db/table_cache_sync_and_async.h"
|
|
#undef WITHOUT_COROUTINES
|
|
#define WITH_COROUTINES
|
|
#include "db/table_cache_sync_and_async.h"
|
|
#undef WITH_COROUTINES
|
|
// clang-format on
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
namespace {
|
|
|
|
static Slice GetSliceForFileNumber(const uint64_t* file_number) {
|
|
return Slice(reinterpret_cast<const char*>(file_number),
|
|
sizeof(*file_number));
|
|
}
|
|
|
|
|
|
void AppendVarint64(IterKey* key, uint64_t v) {
|
|
char buf[10];
|
|
auto ptr = EncodeVarint64(buf, v);
|
|
key->TrimAppend(key->Size(), buf, ptr - buf);
|
|
}
|
|
|
|
|
|
} // anonymous namespace
|
|
|
|
const int kLoadConcurency = 128;
|
|
|
|
TableCache::TableCache(const ImmutableOptions& ioptions,
|
|
const FileOptions* file_options, Cache* const cache,
|
|
BlockCacheTracer* const block_cache_tracer,
|
|
const std::shared_ptr<IOTracer>& io_tracer,
|
|
const std::string& db_session_id)
|
|
: ioptions_(ioptions),
|
|
file_options_(*file_options),
|
|
cache_(cache),
|
|
immortal_tables_(false),
|
|
block_cache_tracer_(block_cache_tracer),
|
|
loader_mutex_(kLoadConcurency),
|
|
io_tracer_(io_tracer),
|
|
db_session_id_(db_session_id) {
|
|
if (ioptions_.row_cache) {
|
|
// If the same cache is shared by multiple instances, we need to
|
|
// disambiguate its entries.
|
|
PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
|
|
}
|
|
}
|
|
|
|
TableCache::~TableCache() = default;
|
|
|
|
Status TableCache::GetTableReader(
|
|
const ReadOptions& ro, const FileOptions& file_options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, bool sequential_mode,
|
|
uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist,
|
|
std::unique_ptr<TableReader>* table_reader,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
|
|
size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
|
|
std::string fname = TableFileName(
|
|
ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId());
|
|
std::unique_ptr<FSRandomAccessFile> file;
|
|
FileOptions fopts = file_options;
|
|
fopts.temperature = file_temperature;
|
|
Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
|
|
TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
|
|
const_cast<Status*>(&s));
|
|
if (s.ok()) {
|
|
s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
|
|
}
|
|
if (s.ok()) {
|
|
RecordTick(ioptions_.stats, NO_FILE_OPENS);
|
|
} else if (s.IsPathNotFound()) {
|
|
fname = Rocks2LevelTableFileName(fname);
|
|
// If this file is also not found, we want to use the error message
|
|
// that contains the table file name which is less confusing.
|
|
Status temp_s =
|
|
PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
|
|
if (temp_s.ok()) {
|
|
temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
|
|
nullptr);
|
|
}
|
|
if (temp_s.ok()) {
|
|
RecordTick(ioptions_.stats, NO_FILE_OPENS);
|
|
s = temp_s;
|
|
}
|
|
}
|
|
|
|
if (s.ok()) {
|
|
if (!sequential_mode && ioptions_.advise_random_on_open) {
|
|
file->Hint(FSRandomAccessFile::kRandom);
|
|
}
|
|
if (ioptions_.default_temperature != Temperature::kUnknown &&
|
|
file_temperature == Temperature::kUnknown) {
|
|
file_temperature = ioptions_.default_temperature;
|
|
}
|
|
StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS);
|
|
std::unique_ptr<RandomAccessFileReader> file_reader(
|
|
new RandomAccessFileReader(std::move(file), fname, ioptions_.clock,
|
|
io_tracer_, ioptions_.stats, SST_READ_MICROS,
|
|
file_read_hist, ioptions_.rate_limiter.get(),
|
|
ioptions_.listeners, file_temperature,
|
|
level == ioptions_.num_levels - 1));
|
|
UniqueId64x2 expected_unique_id;
|
|
if (ioptions_.verify_sst_unique_id_in_manifest) {
|
|
expected_unique_id = file_meta.unique_id;
|
|
} else {
|
|
expected_unique_id = kNullUniqueId64x2; // null ID == no verification
|
|
}
|
|
s = ioptions_.table_factory->NewTableReader(
|
|
ro,
|
|
TableReaderOptions(
|
|
ioptions_, prefix_extractor, file_options, internal_comparator,
|
|
block_protection_bytes_per_key, skip_filters, immortal_tables_,
|
|
false /* force_direct_prefetch */, level, block_cache_tracer_,
|
|
max_file_size_for_l0_meta_pin, db_session_id_,
|
|
file_meta.fd.GetNumber(), expected_unique_id,
|
|
file_meta.fd.largest_seqno, file_meta.tail_size,
|
|
file_meta.user_defined_timestamps_persisted),
|
|
std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
|
|
prefetch_index_and_filter_in_cache);
|
|
TEST_SYNC_POINT("TableCache::GetTableReader:0");
|
|
}
|
|
return s;
|
|
}
|
|
|
|
Cache::Handle* TableCache::Lookup(Cache* cache, uint64_t file_number) {
|
|
Slice key = GetSliceForFileNumber(&file_number);
|
|
return cache->Lookup(key);
|
|
}
|
|
|
|
Status TableCache::FindTable(
|
|
const ReadOptions& ro, const FileOptions& file_options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, TypedHandle** handle,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
const bool no_io, HistogramImpl* file_read_hist, bool skip_filters,
|
|
int level, bool prefetch_index_and_filter_in_cache,
|
|
size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
|
|
PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
|
|
uint64_t number = file_meta.fd.GetNumber();
|
|
Slice key = GetSliceForFileNumber(&number);
|
|
*handle = cache_.Lookup(key);
|
|
TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
|
|
const_cast<bool*>(&no_io));
|
|
|
|
if (*handle == nullptr) {
|
|
if (no_io) {
|
|
return Status::Incomplete("Table not found in table_cache, no_io is set");
|
|
}
|
|
MutexLock load_lock(&loader_mutex_.Get(key));
|
|
// We check the cache again under loading mutex
|
|
*handle = cache_.Lookup(key);
|
|
if (*handle != nullptr) {
|
|
return Status::OK();
|
|
}
|
|
|
|
std::unique_ptr<TableReader> table_reader;
|
|
Status s = GetTableReader(ro, file_options, internal_comparator, file_meta,
|
|
false /* sequential mode */,
|
|
block_protection_bytes_per_key, file_read_hist,
|
|
&table_reader, prefix_extractor, skip_filters,
|
|
level, prefetch_index_and_filter_in_cache,
|
|
max_file_size_for_l0_meta_pin, file_temperature);
|
|
if (!s.ok()) {
|
|
assert(table_reader == nullptr);
|
|
RecordTick(ioptions_.stats, NO_FILE_ERRORS);
|
|
// We do not cache error results so that if the error is transient,
|
|
// or somebody repairs the file, we recover automatically.
|
|
} else {
|
|
s = cache_.Insert(key, table_reader.get(), 1, handle);
|
|
if (s.ok()) {
|
|
// Release ownership of table reader.
|
|
table_reader.release();
|
|
}
|
|
}
|
|
return s;
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
InternalIterator* TableCache::NewIterator(
|
|
const ReadOptions& options, const FileOptions& file_options,
|
|
const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
|
|
RangeDelAggregator* range_del_agg,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
|
|
TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
|
|
size_t max_file_size_for_l0_meta_pin,
|
|
const InternalKey* smallest_compaction_key,
|
|
const InternalKey* largest_compaction_key, bool allow_unprepared_value,
|
|
uint8_t block_protection_bytes_per_key, const SequenceNumber* read_seqno,
|
|
std::unique_ptr<TruncatedRangeDelIterator>* range_del_iter) {
|
|
PERF_TIMER_GUARD(new_table_iterator_nanos);
|
|
|
|
Status s;
|
|
TableReader* table_reader = nullptr;
|
|
TypedHandle* handle = nullptr;
|
|
if (table_reader_ptr != nullptr) {
|
|
*table_reader_ptr = nullptr;
|
|
}
|
|
bool for_compaction = caller == TableReaderCaller::kCompaction;
|
|
auto& fd = file_meta.fd;
|
|
table_reader = fd.table_reader;
|
|
if (table_reader == nullptr) {
|
|
s = FindTable(options, file_options, icomparator, file_meta, &handle,
|
|
block_protection_bytes_per_key, prefix_extractor,
|
|
options.read_tier == kBlockCacheTier /* no_io */,
|
|
file_read_hist, skip_filters, level,
|
|
true /* prefetch_index_and_filter_in_cache */,
|
|
max_file_size_for_l0_meta_pin, file_meta.temperature);
|
|
if (s.ok()) {
|
|
table_reader = cache_.Value(handle);
|
|
}
|
|
}
|
|
InternalIterator* result = nullptr;
|
|
if (s.ok()) {
|
|
if (options.table_filter &&
|
|
!options.table_filter(*table_reader->GetTableProperties())) {
|
|
result = NewEmptyInternalIterator<Slice>(arena);
|
|
} else {
|
|
result = table_reader->NewIterator(
|
|
options, prefix_extractor.get(), arena, skip_filters, caller,
|
|
file_options.compaction_readahead_size, allow_unprepared_value);
|
|
}
|
|
if (handle != nullptr) {
|
|
cache_.RegisterReleaseAsCleanup(handle, *result);
|
|
handle = nullptr; // prevent from releasing below
|
|
}
|
|
|
|
if (for_compaction) {
|
|
table_reader->SetupForCompaction();
|
|
}
|
|
if (table_reader_ptr != nullptr) {
|
|
*table_reader_ptr = table_reader;
|
|
}
|
|
}
|
|
if (s.ok() && !options.ignore_range_deletions) {
|
|
if (range_del_iter != nullptr) {
|
|
auto new_range_del_iter =
|
|
read_seqno ? table_reader->NewRangeTombstoneIterator(
|
|
*read_seqno, options.timestamp)
|
|
: table_reader->NewRangeTombstoneIterator(options);
|
|
if (new_range_del_iter == nullptr || new_range_del_iter->empty()) {
|
|
delete new_range_del_iter;
|
|
*range_del_iter = nullptr;
|
|
} else {
|
|
*range_del_iter = std::make_unique<TruncatedRangeDelIterator>(
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator>(
|
|
new_range_del_iter),
|
|
&icomparator, &file_meta.smallest, &file_meta.largest);
|
|
}
|
|
}
|
|
if (range_del_agg != nullptr) {
|
|
if (range_del_agg->AddFile(fd.GetNumber())) {
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter(
|
|
static_cast<FragmentedRangeTombstoneIterator*>(
|
|
table_reader->NewRangeTombstoneIterator(options)));
|
|
if (new_range_del_iter != nullptr) {
|
|
s = new_range_del_iter->status();
|
|
}
|
|
if (s.ok()) {
|
|
const InternalKey* smallest = &file_meta.smallest;
|
|
const InternalKey* largest = &file_meta.largest;
|
|
if (smallest_compaction_key != nullptr) {
|
|
smallest = smallest_compaction_key;
|
|
}
|
|
if (largest_compaction_key != nullptr) {
|
|
largest = largest_compaction_key;
|
|
}
|
|
range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest,
|
|
largest);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (handle != nullptr) {
|
|
cache_.Release(handle);
|
|
}
|
|
if (!s.ok()) {
|
|
assert(result == nullptr);
|
|
result = NewErrorInternalIterator<Slice>(s, arena);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
Status TableCache::GetRangeTombstoneIterator(
|
|
const ReadOptions& options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
|
|
assert(out_iter);
|
|
const FileDescriptor& fd = file_meta.fd;
|
|
Status s;
|
|
TableReader* t = fd.table_reader;
|
|
TypedHandle* handle = nullptr;
|
|
if (t == nullptr) {
|
|
s = FindTable(options, file_options_, internal_comparator, file_meta,
|
|
&handle, block_protection_bytes_per_key);
|
|
if (s.ok()) {
|
|
t = cache_.Value(handle);
|
|
}
|
|
}
|
|
if (s.ok()) {
|
|
// Note: NewRangeTombstoneIterator could return nullptr
|
|
out_iter->reset(t->NewRangeTombstoneIterator(options));
|
|
}
|
|
if (handle) {
|
|
if (*out_iter) {
|
|
cache_.RegisterReleaseAsCleanup(handle, **out_iter);
|
|
} else {
|
|
cache_.Release(handle);
|
|
}
|
|
}
|
|
return s;
|
|
}
|
|
|
|
uint64_t TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
|
|
const FileDescriptor& fd,
|
|
const Slice& internal_key,
|
|
GetContext* get_context,
|
|
IterKey& row_cache_key) {
|
|
uint64_t fd_number = fd.GetNumber();
|
|
// We use the user key as cache key instead of the internal key,
|
|
// otherwise the whole cache would be invalidated every time the
|
|
// sequence key increases. However, to support caching snapshot
|
|
// reads, we append a sequence number (incremented by 1 to
|
|
// distinguish from 0) other than internal_key seq no
|
|
// to determine row cache entry visibility.
|
|
// If the snapshot is larger than the largest seqno in the file,
|
|
// all data should be exposed to the snapshot, so we treat it
|
|
// the same as there is no snapshot. The exception is that if
|
|
// a seq-checking callback is registered, some internal keys
|
|
// may still be filtered out.
|
|
uint64_t cache_entry_seq_no = 0;
|
|
|
|
// Maybe we can include the whole file ifsnapshot == fd.largest_seqno.
|
|
if (options.snapshot != nullptr &&
|
|
(get_context->has_callback() ||
|
|
static_cast_with_check<const SnapshotImpl>(options.snapshot)
|
|
->GetSequenceNumber() <= fd.largest_seqno)) {
|
|
// We should consider to use options.snapshot->GetSequenceNumber()
|
|
// instead of GetInternalKeySeqno(k), which will make the code
|
|
// easier to understand.
|
|
cache_entry_seq_no = 1 + GetInternalKeySeqno(internal_key);
|
|
}
|
|
|
|
// Compute row cache key.
|
|
row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
|
|
row_cache_id_.size());
|
|
AppendVarint64(&row_cache_key, fd_number);
|
|
AppendVarint64(&row_cache_key, cache_entry_seq_no);
|
|
|
|
// Provide a sequence number for callback checking on cache hit.
|
|
// As cache_entry_seq_no starts at 1, decrease it's value by 1 to get
|
|
// a sequence number align with get context's logic.
|
|
return cache_entry_seq_no == 0 ? 0 : cache_entry_seq_no - 1;
|
|
}
|
|
|
|
bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
|
|
size_t prefix_size, GetContext* get_context,
|
|
Status* read_status, SequenceNumber seq_no) {
|
|
bool found = false;
|
|
|
|
row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
|
|
RowCacheInterface row_cache{ioptions_.row_cache.get()};
|
|
if (auto row_handle = row_cache.Lookup(row_cache_key.GetUserKey())) {
|
|
// Cleanable routine to release the cache entry
|
|
Cleanable value_pinner;
|
|
// If it comes here value is located on the cache.
|
|
// found_row_cache_entry points to the value on cache,
|
|
// and value_pinner has cleanup procedure for the cached entry.
|
|
// After replayGetContextLog() returns, get_context.pinnable_slice_
|
|
// will point to cache entry buffer (or a copy based on that) and
|
|
// cleanup routine under value_pinner will be delegated to
|
|
// get_context.pinnable_slice_. Cache entry is released when
|
|
// get_context.pinnable_slice_ is reset.
|
|
row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner);
|
|
// If row cache hit, knowing cache key is the same to row_cache_key,
|
|
// can use row_cache_key's seq no to construct InternalKey.
|
|
*read_status = replayGetContextLog(*row_cache.Value(row_handle), user_key,
|
|
get_context, &value_pinner, seq_no);
|
|
RecordTick(ioptions_.stats, ROW_CACHE_HIT);
|
|
found = true;
|
|
} else {
|
|
RecordTick(ioptions_.stats, ROW_CACHE_MISS);
|
|
}
|
|
return found;
|
|
}
|
|
|
|
Status TableCache::Get(
|
|
const ReadOptions& options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
HistogramImpl* file_read_hist, bool skip_filters, int level,
|
|
size_t max_file_size_for_l0_meta_pin) {
|
|
auto& fd = file_meta.fd;
|
|
std::string* row_cache_entry = nullptr;
|
|
bool done = false;
|
|
IterKey row_cache_key;
|
|
std::string row_cache_entry_buffer;
|
|
|
|
// Check row cache if enabled.
|
|
// Reuse row_cache_key sequence number when row cache hits.
|
|
Status s;
|
|
if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
|
|
auto user_key = ExtractUserKey(k);
|
|
uint64_t cache_entry_seq_no =
|
|
CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
|
|
done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
|
|
get_context, &s, cache_entry_seq_no);
|
|
if (!done) {
|
|
row_cache_entry = &row_cache_entry_buffer;
|
|
}
|
|
}
|
|
TableReader* t = fd.table_reader;
|
|
TypedHandle* handle = nullptr;
|
|
if (s.ok() && !done) {
|
|
if (t == nullptr) {
|
|
s = FindTable(options, file_options_, internal_comparator, file_meta,
|
|
&handle, block_protection_bytes_per_key, prefix_extractor,
|
|
options.read_tier == kBlockCacheTier /* no_io */,
|
|
file_read_hist, skip_filters, level,
|
|
true /* prefetch_index_and_filter_in_cache */,
|
|
max_file_size_for_l0_meta_pin, file_meta.temperature);
|
|
if (s.ok()) {
|
|
t = cache_.Value(handle);
|
|
}
|
|
}
|
|
SequenceNumber* max_covering_tombstone_seq =
|
|
get_context->max_covering_tombstone_seq();
|
|
if (s.ok() && max_covering_tombstone_seq != nullptr &&
|
|
!options.ignore_range_deletions) {
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
|
|
t->NewRangeTombstoneIterator(options));
|
|
if (range_del_iter != nullptr) {
|
|
SequenceNumber seq =
|
|
range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k));
|
|
if (seq > *max_covering_tombstone_seq) {
|
|
*max_covering_tombstone_seq = seq;
|
|
if (get_context->NeedTimestamp()) {
|
|
get_context->SetTimestampFromRangeTombstone(
|
|
range_del_iter->timestamp());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (s.ok()) {
|
|
get_context->SetReplayLog(row_cache_entry); // nullptr if no cache.
|
|
s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters);
|
|
get_context->SetReplayLog(nullptr);
|
|
} else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
|
|
// Couldn't find table in cache and couldn't open it because of no_io.
|
|
get_context->MarkKeyMayExist();
|
|
done = true;
|
|
}
|
|
}
|
|
|
|
// Put the replay log in row cache only if something was found.
|
|
if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
|
|
RowCacheInterface row_cache{ioptions_.row_cache.get()};
|
|
size_t charge = row_cache_entry->capacity() + sizeof(std::string);
|
|
auto row_ptr = new std::string(std::move(*row_cache_entry));
|
|
Status rcs = row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge);
|
|
if (!rcs.ok()) {
|
|
// If row cache is full, it's OK to continue, but we keep ownership of
|
|
// row_ptr.
|
|
delete row_ptr;
|
|
}
|
|
}
|
|
|
|
if (handle != nullptr) {
|
|
cache_.Release(handle);
|
|
}
|
|
return s;
|
|
}
|
|
|
|
void TableCache::UpdateRangeTombstoneSeqnums(
|
|
const ReadOptions& options, TableReader* t,
|
|
MultiGetContext::Range& table_range) {
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
|
|
t->NewRangeTombstoneIterator(options));
|
|
if (range_del_iter != nullptr) {
|
|
for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
|
|
SequenceNumber* max_covering_tombstone_seq =
|
|
iter->get_context->max_covering_tombstone_seq();
|
|
SequenceNumber seq =
|
|
range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts);
|
|
if (seq > *max_covering_tombstone_seq) {
|
|
*max_covering_tombstone_seq = seq;
|
|
if (iter->get_context->NeedTimestamp()) {
|
|
iter->get_context->SetTimestampFromRangeTombstone(
|
|
range_del_iter->timestamp());
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Status TableCache::MultiGetFilter(
|
|
const ReadOptions& options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
HistogramImpl* file_read_hist, int level,
|
|
MultiGetContext::Range* mget_range, TypedHandle** table_handle,
|
|
uint8_t block_protection_bytes_per_key) {
|
|
auto& fd = file_meta.fd;
|
|
IterKey row_cache_key;
|
|
std::string row_cache_entry_buffer;
|
|
|
|
// Check if we need to use the row cache. If yes, then we cannot do the
|
|
// filtering here, since the filtering needs to happen after the row cache
|
|
// lookup.
|
|
KeyContext& first_key = *mget_range->begin();
|
|
if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) {
|
|
return Status::NotSupported();
|
|
}
|
|
Status s;
|
|
TableReader* t = fd.table_reader;
|
|
TypedHandle* handle = nullptr;
|
|
MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
|
|
mget_range->end());
|
|
if (t == nullptr) {
|
|
s = FindTable(options, file_options_, internal_comparator, file_meta,
|
|
&handle, block_protection_bytes_per_key, prefix_extractor,
|
|
options.read_tier == kBlockCacheTier /* no_io */,
|
|
file_read_hist,
|
|
/*skip_filters=*/false, level,
|
|
true /* prefetch_index_and_filter_in_cache */,
|
|
/*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
|
|
if (s.ok()) {
|
|
t = cache_.Value(handle);
|
|
}
|
|
*table_handle = handle;
|
|
}
|
|
if (s.ok()) {
|
|
s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
|
|
}
|
|
if (s.ok() && !options.ignore_range_deletions) {
|
|
// Update the range tombstone sequence numbers for the keys here
|
|
// as TableCache::MultiGet may or may not be called, and even if it
|
|
// is, it may be called with fewer keys in the rangedue to filtering.
|
|
UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
|
|
}
|
|
if (mget_range->empty() && handle) {
|
|
cache_.Release(handle);
|
|
*table_handle = nullptr;
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
Status TableCache::GetTableProperties(
|
|
const FileOptions& file_options, const ReadOptions& read_options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta,
|
|
std::shared_ptr<const TableProperties>* properties,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) {
|
|
auto table_reader = file_meta.fd.table_reader;
|
|
// table already been pre-loaded?
|
|
if (table_reader) {
|
|
*properties = table_reader->GetTableProperties();
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
TypedHandle* table_handle = nullptr;
|
|
Status s = FindTable(read_options, file_options, internal_comparator,
|
|
file_meta, &table_handle, block_protection_bytes_per_key,
|
|
prefix_extractor, no_io);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
assert(table_handle);
|
|
auto table = cache_.Value(table_handle);
|
|
*properties = table->GetTableProperties();
|
|
cache_.Release(table_handle);
|
|
return s;
|
|
}
|
|
|
|
Status TableCache::ApproximateKeyAnchors(
|
|
const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
|
|
std::vector<TableReader::Anchor>& anchors) {
|
|
Status s;
|
|
TableReader* t = file_meta.fd.table_reader;
|
|
TypedHandle* handle = nullptr;
|
|
if (t == nullptr) {
|
|
s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle,
|
|
block_protection_bytes_per_key);
|
|
if (s.ok()) {
|
|
t = cache_.Value(handle);
|
|
}
|
|
}
|
|
if (s.ok() && t != nullptr) {
|
|
s = t->ApproximateKeyAnchors(ro, anchors);
|
|
}
|
|
if (handle != nullptr) {
|
|
cache_.Release(handle);
|
|
}
|
|
return s;
|
|
}
|
|
|
|
size_t TableCache::GetMemoryUsageByTableReader(
|
|
const FileOptions& file_options, const ReadOptions& read_options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
|
|
auto table_reader = file_meta.fd.table_reader;
|
|
// table already been pre-loaded?
|
|
if (table_reader) {
|
|
return table_reader->ApproximateMemoryUsage();
|
|
}
|
|
|
|
TypedHandle* table_handle = nullptr;
|
|
Status s = FindTable(read_options, file_options, internal_comparator,
|
|
file_meta, &table_handle, block_protection_bytes_per_key,
|
|
prefix_extractor, true /* no_io */);
|
|
if (!s.ok()) {
|
|
return 0;
|
|
}
|
|
assert(table_handle);
|
|
auto table = cache_.Value(table_handle);
|
|
auto ret = table->ApproximateMemoryUsage();
|
|
cache_.Release(table_handle);
|
|
return ret;
|
|
}
|
|
|
|
void TableCache::Evict(Cache* cache, uint64_t file_number) {
|
|
cache->Erase(GetSliceForFileNumber(&file_number));
|
|
}
|
|
|
|
uint64_t TableCache::ApproximateOffsetOf(
|
|
const ReadOptions& read_options, const Slice& key,
|
|
const FileMetaData& file_meta, TableReaderCaller caller,
|
|
const InternalKeyComparator& internal_comparator,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
|
|
uint64_t result = 0;
|
|
TableReader* table_reader = file_meta.fd.table_reader;
|
|
TypedHandle* table_handle = nullptr;
|
|
if (table_reader == nullptr) {
|
|
Status s =
|
|
FindTable(read_options, file_options_, internal_comparator, file_meta,
|
|
&table_handle, block_protection_bytes_per_key,
|
|
prefix_extractor, false /* no_io */);
|
|
if (s.ok()) {
|
|
table_reader = cache_.Value(table_handle);
|
|
}
|
|
}
|
|
|
|
if (table_reader != nullptr) {
|
|
result = table_reader->ApproximateOffsetOf(read_options, key, caller);
|
|
}
|
|
if (table_handle != nullptr) {
|
|
cache_.Release(table_handle);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
uint64_t TableCache::ApproximateSize(
|
|
const ReadOptions& read_options, const Slice& start, const Slice& end,
|
|
const FileMetaData& file_meta, TableReaderCaller caller,
|
|
const InternalKeyComparator& internal_comparator,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor) {
|
|
uint64_t result = 0;
|
|
TableReader* table_reader = file_meta.fd.table_reader;
|
|
TypedHandle* table_handle = nullptr;
|
|
if (table_reader == nullptr) {
|
|
Status s =
|
|
FindTable(read_options, file_options_, internal_comparator, file_meta,
|
|
&table_handle, block_protection_bytes_per_key,
|
|
prefix_extractor, false /* no_io */);
|
|
if (s.ok()) {
|
|
table_reader = cache_.Value(table_handle);
|
|
}
|
|
}
|
|
|
|
if (table_reader != nullptr) {
|
|
result = table_reader->ApproximateSize(read_options, start, end, caller);
|
|
}
|
|
if (table_handle != nullptr) {
|
|
cache_.Release(table_handle);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
void TableCache::ReleaseObsolete(Cache* cache, Cache::Handle* h,
|
|
uint32_t uncache_aggressiveness) {
|
|
CacheInterface typed_cache(cache);
|
|
TypedHandle* table_handle = reinterpret_cast<TypedHandle*>(h);
|
|
TableReader* table_reader = typed_cache.Value(table_handle);
|
|
table_reader->MarkObsolete(uncache_aggressiveness);
|
|
typed_cache.ReleaseAndEraseIfLastRef(table_handle);
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|