mirror of
https://github.com/facebook/rocksdb.git
synced 2024-12-01 07:15:51 +00:00
bf98dcf9a8
Summary: The original goal is to propagate failures from `GetContext::SaveValue()` -> `GetContext::GetBlobValue()` -> `BlobFetcher::FetchBlob()` up to the user. This call sequence happens when a merge chain ends with a base value in a blob file. There's also fixes for bugs encountered along the way where non-ok statuses were ignored/overwritten, and a bit of plumbing work for functions that had no capability to return a status. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12462 Test Plan: A repro command ``` db=/dev/shm/dbstress_db ; exp=/dev/shm/dbstress_exp ; rm -rf $db $exp ; mkdir -p $db $exp ./db_stress \ --clear_column_family_one_in=0 \ --test_batches_snapshots=0 \ --write_fault_one_in=0 \ --use_put_entity_one_in=0 \ --prefixpercent=0 \ --read_fault_one_in=0 \ --readpercent=0 \ --reopen=0 \ --set_options_one_in=10000 \ --delpercent=0 \ --delrangepercent=0 \ --open_metadata_write_fault_one_in=0 \ --open_read_fault_one_in=0 \ --open_write_fault_one_in=0 \ --destroy_db_initially=0 \ --ingest_external_file_one_in=0 \ --iterpercent=0 \ --nooverwritepercent=0 \ --db=$db \ --enable_blob_files=1 \ --expected_values_dir=$exp \ --max_background_compactions=20 \ --max_bytes_for_level_base=2097152 \ --max_key=100000 \ --min_blob_size=0 \ --open_files=-1 \ --ops_per_thread=100000000 \ --prefix_size=-1 \ --target_file_size_base=524288 \ --use_merge=1 \ --value_size_mult=32 \ --write_buffer_size=524288 \ --writepercent=100 ``` It used to fail like: ``` ... frame https://github.com/facebook/rocksdb/issues/9: 0x00007fc63903bc93 libc.so.6`__GI___assert_fail(assertion="HasDefaultColumn(columns)", file="fbcode/internal_repo_rocksdb/repo/db/wide/wide_columns_helper.h", line=33, function="static const rocksdb::Slice &rocksdb::WideColumnsHelper::GetDefaultColumn(const rocksdb::WideColumns &)") at assert.c:101:3 frame https://github.com/facebook/rocksdb/issues/10: 0x00000000006f7e92 db_stress`rocksdb::Version::Get(rocksdb::ReadOptions const&, rocksdb::LookupKey const&, rocksdb::PinnableSlice*, rocksdb::PinnableWideColumns*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>*, rocksdb::Status*, rocksdb::MergeContext*, unsigned long*, rocksdb::PinnedIteratorsManager*, bool*, bool*, unsigned long*, rocksdb::ReadCallback*, bool*, bool) [inlined] rocksdb::WideColumnsHelper::GetDefaultColumn(columns=size=0) at wide_columns_helper.h:33 frame https://github.com/facebook/rocksdb/issues/11: 0x00000000006f7e76 db_stress`rocksdb::Version::Get(this=0x00007fc5ec763000, read_options=<unavailable>, k=<unavailable>, value=0x0000000000000000, columns=0x00007fc6035fd1d8, timestamp=<unavailable>, status=0x00007fc6035fd250, merge_context=0x00007fc6035fce40, max_covering_tombstone_seq=0x00007fc6035fce90, pinned_iters_mgr=0x00007fc6035fcdf0, value_found=0x0000000000000000, key_exists=0x0000000000000000, seq=0x0000000000000000, callback=0x0000000000000000, is_blob=0x0000000000000000, do_merge=<unavailable>) at version_set.cc:2492 frame https://github.com/facebook/rocksdb/issues/12: 0x000000000051e245 db_stress`rocksdb::DBImpl::GetImpl(this=0x00007fc637a86000, read_options=0x00007fc6035fcf60, key=<unavailable>, get_impl_options=0x00007fc6035fd000) at db_impl.cc:2408 frame https://github.com/facebook/rocksdb/issues/13: 0x000000000050cec2 db_stress`rocksdb::DBImpl::GetEntity(this=0x00007fc637a86000, _read_options=<unavailable>, column_family=<unavailable>, key=0x00007fc6035fd3c8, columns=0x00007fc6035fd1d8) at db_impl.cc:2109 frame https://github.com/facebook/rocksdb/issues/14: 0x000000000074f688 db_stress`rocksdb::(anonymous namespace)::MemTableInserter::MergeCF(this=0x00007fc6035fd450, column_family_id=2, key=0x00007fc6035fd3c8, value=0x00007fc6035fd3a0) at write_batch.cc:2656 frame https://github.com/facebook/rocksdb/issues/15: 0x00000000007476fc db_stress`rocksdb::WriteBatchInternal::Iterate(wb=0x00007fc6035fe698, handler=0x00007fc6035fd450, begin=12, end=<unavailable>) at write_batch.cc:607 frame https://github.com/facebook/rocksdb/issues/16: 0x000000000074d7dd db_stress`rocksdb::WriteBatchInternal::InsertInto(rocksdb::WriteThread::WriteGroup&, unsigned long, rocksdb::ColumnFamilyMemTables*, rocksdb::FlushScheduler*, rocksdb::TrimHistoryScheduler*, bool, unsigned long, rocksdb::DB*, bool, bool, bool) [inlined] rocksdb::WriteBatch::Iterate(this=<unavailable>, handler=0x00007fc6035fd450) const at write_batch.cc:505 frame https://github.com/facebook/rocksdb/issues/17: 0x000000000074d77b db_stress`rocksdb::WriteBatchInternal::InsertInto(write_group=<unavailable>, sequence=<unavailable>, memtables=<unavailable>, flush_scheduler=<unavailable>, trim_history_scheduler=<unavailable>, ignore_missing_column_families=<unavailable>, recovery_log_number=0, db=0x00007fc637a86000, concurrent_memtable_writes=<unavailable>, seq_per_batch=false, batch_per_txn=<unavailable>) at write_batch.cc:3084 frame https://github.com/facebook/rocksdb/issues/18: 0x0000000000631d77 db_stress`rocksdb::DBImpl::PipelinedWriteImpl(this=0x00007fc637a86000, write_options=<unavailable>, my_batch=0x00007fc6035fe698, callback=0x0000000000000000, log_used=<unavailable>, log_ref=0, disable_memtable=<unavailable>, seq_used=0x0000000000000000) at db_impl_write.cc:807 frame https://github.com/facebook/rocksdb/issues/19: 0x000000000062ceeb db_stress`rocksdb::DBImpl::WriteImpl(this=<unavailable>, write_options=<unavailable>, my_batch=0x00007fc6035fe698, callback=0x0000000000000000, log_used=<unavailable>, log_ref=0, disable_memtable=<unavailable>, seq_used=0x0000000000000000, batch_cnt=0, pre_release_callback=0x0000000000000000, post_memtable_callback=0x0000000000000000) at db_impl_write.cc:312 frame https://github.com/facebook/rocksdb/issues/20: 0x000000000062c8ec db_stress`rocksdb::DBImpl::Write(this=0x00007fc637a86000, write_options=0x00007fc6035feca8, my_batch=0x00007fc6035fe698) at db_impl_write.cc:157 frame https://github.com/facebook/rocksdb/issues/21: 0x000000000062b847 db_stress`rocksdb::DB::Merge(this=0x00007fc637a86000, opt=0x00007fc6035feca8, column_family=0x00007fc6370bf140, key=0x00007fc6035fe8d8, value=0x00007fc6035fe830) at db_impl_write.cc:2544 frame https://github.com/facebook/rocksdb/issues/22: 0x000000000062b6ef db_stress`rocksdb::DBImpl::Merge(this=0x00007fc637a86000, o=<unavailable>, column_family=0x00007fc6370bf140, key=0x00007fc6035fe8d8, val=0x00007fc6035fe830) at db_impl_write.cc:72 frame https://github.com/facebook/rocksdb/issues/23: 0x00000000004d6397 db_stress`rocksdb::NonBatchedOpsStressTest::TestPut(this=0x00007fc637041000, thread=0x00007fc6370dbc00, write_opts=0x00007fc6035feca8, read_opts=0x00007fc6035fe9c8, rand_column_families=<unavailable>, rand_keys=size=1, value={P\xe9_\x03\xc6\x7f\0\0}) at no_batched_ops_stress.cc:1317 frame https://github.com/facebook/rocksdb/issues/24: 0x000000000049361d db_stress`rocksdb::StressTest::OperateDb(this=0x00007fc637041000, thread=0x00007fc6370dbc00) at db_stress_test_base.cc:1148 ... ``` Reviewed By: ltamasi Differential Revision: D55157795 Pulled By: ajkr fbshipit-source-id: 5f7c1380ead5794c29d41680028e34b839744764
291 lines
14 KiB
C++
291 lines
14 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
//
|
|
// Thread-safe (provides internal synchronization)
|
|
|
|
#pragma once
|
|
#include <cstdint>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "cache/typed_cache.h"
|
|
#include "db/dbformat.h"
|
|
#include "db/range_del_aggregator.h"
|
|
#include "options/cf_options.h"
|
|
#include "port/port.h"
|
|
#include "rocksdb/cache.h"
|
|
#include "rocksdb/env.h"
|
|
#include "rocksdb/options.h"
|
|
#include "rocksdb/table.h"
|
|
#include "table/table_reader.h"
|
|
#include "trace_replay/block_cache_tracer.h"
|
|
#include "util/coro_utils.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
class Env;
|
|
class Arena;
|
|
struct FileDescriptor;
|
|
class GetContext;
|
|
class HistogramImpl;
|
|
|
|
// Manages caching for TableReader objects for a column family. The actual
|
|
// cache is allocated separately and passed to the constructor. TableCache
|
|
// wraps around the underlying SST file readers by providing Get(),
|
|
// MultiGet() and NewIterator() methods that hide the instantiation,
|
|
// caching and access to the TableReader. The main purpose of this is
|
|
// performance - by caching the TableReader, it avoids unnecessary file opens
|
|
// and object allocation and instantiation. One exception is compaction, where
|
|
// a new TableReader may be instantiated - see NewIterator() comments
|
|
//
|
|
// Another service provided by TableCache is managing the row cache - if the
|
|
// DB is configured with a row cache, and the lookup key is present in the row
|
|
// cache, lookup is very fast. The row cache is obtained from
|
|
// ioptions.row_cache
|
|
class TableCache {
|
|
public:
|
|
TableCache(const ImmutableOptions& ioptions,
|
|
const FileOptions* storage_options, Cache* cache,
|
|
BlockCacheTracer* const block_cache_tracer,
|
|
const std::shared_ptr<IOTracer>& io_tracer,
|
|
const std::string& db_session_id);
|
|
~TableCache();
|
|
|
|
// Cache interface for table cache
|
|
using CacheInterface =
|
|
BasicTypedCacheInterface<TableReader, CacheEntryRole::kMisc>;
|
|
using TypedHandle = CacheInterface::TypedHandle;
|
|
|
|
// Cache interface for row cache
|
|
using RowCacheInterface =
|
|
BasicTypedCacheInterface<std::string, CacheEntryRole::kMisc>;
|
|
using RowHandle = RowCacheInterface::TypedHandle;
|
|
|
|
// Return an iterator for the specified file number (the corresponding
|
|
// file length must be exactly "file_size" bytes). If "table_reader_ptr"
|
|
// is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
|
|
// underlying the returned iterator, or nullptr if no Table object underlies
|
|
// the returned iterator. The returned "*table_reader_ptr" object is owned
|
|
// by the cache and should not be deleted, and is valid for as long as the
|
|
// returned iterator is live.
|
|
// If !options.ignore_range_deletions, and range_del_iter is non-nullptr,
|
|
// then range_del_iter is set to a TruncatedRangeDelIterator for range
|
|
// tombstones in the SST file corresponding to the specified file number. The
|
|
// upper/lower bounds for the TruncatedRangeDelIterator are set to the SST
|
|
// file's boundary.
|
|
// @param options Must outlive the returned iterator.
|
|
// @param range_del_agg If non-nullptr, adds range deletions to the
|
|
// aggregator. If an error occurs, returns it in a NewErrorInternalIterator
|
|
// @param for_compaction If true, a new TableReader may be allocated (but
|
|
// not cached), depending on the CF options
|
|
// @param skip_filters Disables loading/accessing the filter block
|
|
// @param level The level this table is at, -1 for "not set / don't know"
|
|
// @param range_del_read_seqno If non-nullptr, will be used to create
|
|
// *range_del_iter.
|
|
InternalIterator* NewIterator(
|
|
const ReadOptions& options, const FileOptions& toptions,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
|
|
TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
|
|
size_t max_file_size_for_l0_meta_pin,
|
|
const InternalKey* smallest_compaction_key,
|
|
const InternalKey* largest_compaction_key, bool allow_unprepared_value,
|
|
uint8_t protection_bytes_per_key,
|
|
const SequenceNumber* range_del_read_seqno = nullptr,
|
|
TruncatedRangeDelIterator** range_del_iter = nullptr);
|
|
|
|
// If a seek to internal key "k" in specified file finds an entry,
|
|
// call get_context->SaveValue() repeatedly until
|
|
// it returns false. As a side effect, it will insert the TableReader
|
|
// into the cache and potentially evict another entry
|
|
// @param get_context Context for get operation. The result of the lookup
|
|
// can be retrieved by calling get_context->State()
|
|
// @param file_read_hist If non-nullptr, the file reader statistics are
|
|
// recorded
|
|
// @param skip_filters Disables loading/accessing the filter block
|
|
// @param level The level this table is at, -1 for "not set / don't know"
|
|
Status Get(
|
|
const ReadOptions& options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
|
|
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
|
|
int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
|
|
|
|
// Return the range delete tombstone iterator of the file specified by
|
|
// `file_meta`.
|
|
Status GetRangeTombstoneIterator(
|
|
const ReadOptions& options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
|
|
|
|
// Call table reader's MultiGetFilter to use the bloom filter to filter out
|
|
// keys. Returns Status::NotSupported() if row cache needs to be checked.
|
|
// If the table cache is looked up to get the table reader, the cache handle
|
|
// is returned in table_handle. This handle should be passed back to
|
|
// MultiGet() so it can be released.
|
|
Status MultiGetFilter(
|
|
const ReadOptions& options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor,
|
|
HistogramImpl* file_read_hist, int level,
|
|
MultiGetContext::Range* mget_range, TypedHandle** table_handle,
|
|
uint8_t block_protection_bytes_per_key);
|
|
|
|
// If a seek to internal key "k" in specified file finds an entry,
|
|
// call get_context->SaveValue() repeatedly until
|
|
// it returns false. As a side effect, it will insert the TableReader
|
|
// into the cache and potentially evict another entry
|
|
// @param mget_range Pointer to the structure describing a batch of keys to
|
|
// be looked up in this table file. The result is stored
|
|
// in the embedded GetContext
|
|
// @param skip_filters Disables loading/accessing the filter block
|
|
// @param level The level this table is at, -1 for "not set / don't know"
|
|
DECLARE_SYNC_AND_ASYNC(
|
|
Status, MultiGet, const ReadOptions& options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
|
|
HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
|
|
bool skip_range_deletions = false, int level = -1,
|
|
TypedHandle* table_handle = nullptr);
|
|
|
|
// Evict any entry for the specified file number
|
|
static void Evict(Cache* cache, uint64_t file_number);
|
|
|
|
// Find table reader
|
|
// @param skip_filters Disables loading/accessing the filter block
|
|
// @param level == -1 means not specified
|
|
Status FindTable(
|
|
const ReadOptions& ro, const FileOptions& toptions,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, TypedHandle**,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
|
|
const bool no_io = false, HistogramImpl* file_read_hist = nullptr,
|
|
bool skip_filters = false, int level = -1,
|
|
bool prefetch_index_and_filter_in_cache = true,
|
|
size_t max_file_size_for_l0_meta_pin = 0,
|
|
Temperature file_temperature = Temperature::kUnknown);
|
|
|
|
// Get the table properties of a given table.
|
|
// @no_io: indicates if we should load table to the cache if it is not present
|
|
// in table cache yet.
|
|
// @returns: `properties` will be reset on success. Please note that we will
|
|
// return Status::Incomplete() if table is not present in cache and
|
|
// we set `no_io` to be true.
|
|
Status GetTableProperties(
|
|
const FileOptions& toptions, const ReadOptions& read_options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta,
|
|
std::shared_ptr<const TableProperties>* properties,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
|
|
bool no_io = false);
|
|
|
|
Status ApproximateKeyAnchors(const ReadOptions& ro,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta,
|
|
uint8_t block_protection_bytes_per_key,
|
|
std::vector<TableReader::Anchor>& anchors);
|
|
|
|
// Return total memory usage of the table reader of the file.
|
|
// 0 if table reader of the file is not loaded.
|
|
size_t GetMemoryUsageByTableReader(
|
|
const FileOptions& toptions, const ReadOptions& read_options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
|
|
|
|
// Returns approximated offset of a key in a file represented by fd.
|
|
uint64_t ApproximateOffsetOf(
|
|
const ReadOptions& read_options, const Slice& key,
|
|
const FileMetaData& file_meta, TableReaderCaller caller,
|
|
const InternalKeyComparator& internal_comparator,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
|
|
|
|
// Returns approximated data size between start and end keys in a file
|
|
// represented by fd (the start key must not be greater than the end key).
|
|
uint64_t ApproximateSize(
|
|
const ReadOptions& read_options, const Slice& start, const Slice& end,
|
|
const FileMetaData& file_meta, TableReaderCaller caller,
|
|
const InternalKeyComparator& internal_comparator,
|
|
uint8_t block_protection_bytes_per_key,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
|
|
|
|
CacheInterface& get_cache() { return cache_; }
|
|
|
|
// Capacity of the backing Cache that indicates infinite TableCache capacity.
|
|
// For example when max_open_files is -1 we set the backing Cache to this.
|
|
static const int kInfiniteCapacity = 0x400000;
|
|
|
|
// The tables opened with this TableCache will be immortal, i.e., their
|
|
// lifetime is as long as that of the DB.
|
|
void SetTablesAreImmortal() {
|
|
if (cache_.get()->GetCapacity() >= kInfiniteCapacity) {
|
|
immortal_tables_ = true;
|
|
}
|
|
}
|
|
|
|
private:
|
|
// Build a table reader
|
|
Status GetTableReader(
|
|
const ReadOptions& ro, const FileOptions& file_options,
|
|
const InternalKeyComparator& internal_comparator,
|
|
const FileMetaData& file_meta, bool sequential_mode,
|
|
uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist,
|
|
std::unique_ptr<TableReader>* table_reader,
|
|
const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
|
|
bool skip_filters = false, int level = -1,
|
|
bool prefetch_index_and_filter_in_cache = true,
|
|
size_t max_file_size_for_l0_meta_pin = 0,
|
|
Temperature file_temperature = Temperature::kUnknown);
|
|
|
|
// Update the max_covering_tombstone_seq in the GetContext for each key based
|
|
// on the range deletions in the table
|
|
void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
|
|
MultiGetContext::Range& table_range);
|
|
|
|
// Create a key prefix for looking up the row cache. The prefix is of the
|
|
// format row_cache_id + fd_number + seq_no. Later, the user key can be
|
|
// appended to form the full key
|
|
// Return the sequence number that determines the visibility of row_cache_key
|
|
uint64_t CreateRowCacheKeyPrefix(const ReadOptions& options,
|
|
const FileDescriptor& fd,
|
|
const Slice& internal_key,
|
|
GetContext* get_context,
|
|
IterKey& row_cache_key);
|
|
|
|
// Helper function to lookup the row cache for a key. It appends the
|
|
// user key to row_cache_key at offset prefix_size
|
|
bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
|
|
size_t prefix_size, GetContext* get_context,
|
|
Status* read_status,
|
|
SequenceNumber seq_no = kMaxSequenceNumber);
|
|
|
|
const ImmutableOptions& ioptions_;
|
|
const FileOptions& file_options_;
|
|
CacheInterface cache_;
|
|
std::string row_cache_id_;
|
|
bool immortal_tables_;
|
|
BlockCacheTracer* const block_cache_tracer_;
|
|
Striped<CacheAlignedWrapper<port::Mutex>> loader_mutex_;
|
|
std::shared_ptr<IOTracer> io_tracer_;
|
|
std::string db_session_id_;
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|