mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-28 15:33:54 +00:00
3093d98c78
Summary: **Context:** [PR11406](https://github.com/facebook/rocksdb/pull/11406/) caused more frequent reads during db open when reading files with no `tail_size` in the manifest as part of the upgrade to 11406. This is because that PR introduced - [smaller](https://github.com/facebook/rocksdb/pull/11406/files#diff-57ed8c49db2bdd4db7618646a177397674bbf25beacacecb104070071d30129fR833) prefetch tail buffer size compared to pre-11406 for small files (< 52 MB) when `tail_prefetch_stats` infers tail size to be 0 (usually happens when the stats do not have much historical data to infer early on) - more reads (up to # of partitioned filter/index) when such a small prefetch tail buffer does not contain all the partitioned filter/index needed in CacheDependencies() since the [fallback logic](https://github.com/facebook/rocksdb/pull/11406/files#diff-d98f1a83de24412ad7f3527725dae7e28851c7222622c3cdb832d3cdf24bbf9fR165-R179) that prefetches all partitions at once will be [skipped](url) when such a small prefetch tail buffer is passed in **Summary:** - Revert the fallback prefetch buffer size change to preserve existing behavior fully during upgrading in `BlockBasedTable::PrefetchTail()` - Use passed-in prefetch tail buffer in `CacheDependencies()` only if it has a smaller offset than the offset of the first partition filter/index, that is, at least as good as the existing prefetching behavior Pull Request resolved: https://github.com/facebook/rocksdb/pull/11516 Test Plan: - db bench Create db with small files prior to PR 11406 ``` ./db_bench -db=/tmp/testdb/ --partition_index_and_filters=1 --statistics=1 -benchmarks=fillseq -key_size=3200 -value_size=5 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=true -compression_type=zstd ``` Read db to see if post-pr has lower read qps (i.e, rocksdb.file.read.db.open.micros count) during db open.
``` ./db_bench -use_direct_reads=1 --file_opening_threads=1 --threads=1 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 --db=/tmp/testdb/ --benchmarks=readrandom --key_size=3200 --value_size=5 --num=100 --disable_auto_compactions=true --compression_type=zstd ``` Pre-PR: ``` rocksdb.file.read.db.open.micros P50 : 3.399023 P95 : 5.924468 P99 : 12.408333 P100 : 29.000000 COUNT : 611 SUM : 2539 ``` Post-PR: ``` rocksdb.file.read.db.open.micros P50 : 593.736842 P95 : 861.605263 P99 : 1212.868421 P100 : 2663.000000 COUNT : 585 SUM : 345349 ``` _Note: To control the starting offset of the prefetch tail buffer easier, I manually override the following to eliminate the effect of alignment_ ``` class PosixRandomAccessFile : public FSRandomAccessFile { virtual size_t GetRequiredBufferAlignment() const override { - return logical_sector_size_; + return 1; } ``` - CI Reviewed By: pdillinger Differential Revision: D46472566 Pulled By: hx235 fbshipit-source-id: 2fe14ac8d489d15b0e08e6f8fe4f46d5f110978e
228 lines
8.4 KiB
C++
228 lines
8.4 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
#include "table/block_based/partitioned_index_reader.h"
|
|
|
|
#include "block_cache.h"
|
|
#include "file/random_access_file_reader.h"
|
|
#include "table/block_based/block_based_table_reader.h"
|
|
#include "table/block_based/partitioned_index_iterator.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
// Constructs a PartitionIndexReader for `table` and stores it in
// `*index_reader`.
//
// The top-level index block is read up front when `prefetch` is set or when
// `use_cache` is not set. After a successful read, the block reference is
// dropped again if `use_cache` is set and `pin` is not, so the new reader is
// constructed with an empty entry in that case.
//
// Preconditions (asserted): `table` and its rep are non-null, `index_reader`
// is non-null, and `pin` implies `prefetch`.
//
// Returns the status of the index block read if it fails, otherwise
// Status::OK().
Status PartitionIndexReader::Create(
    const BlockBasedTable* table, const ReadOptions& ro,
    FilePrefetchBuffer* prefetch_buffer, bool use_cache, bool prefetch,
    bool pin, BlockCacheLookupContext* lookup_context,
    std::unique_ptr<IndexReader>* index_reader) {
  assert(table != nullptr);
  assert(table->get_rep());
  assert(!pin || prefetch);
  assert(index_reader != nullptr);

  CachableEntry<Block> top_level_block;

  const bool read_now = prefetch || !use_cache;
  if (read_now) {
    const Status read_status = ReadIndexBlock(
        table, prefetch_buffer, ro, use_cache,
        /*get_context=*/nullptr, lookup_context, &top_level_block);
    if (!read_status.ok()) {
      return read_status;
    }

    // Keep holding the block only when pinning was requested or when
    // `use_cache` is off.
    const bool keep_reference = pin || !use_cache;
    if (!keep_reference) {
      top_level_block.Reset();
    }
  }

  index_reader->reset(
      new PartitionIndexReader(table, std::move(top_level_block)));
  return Status::OK();
}
|
|
|
|
// Creates an iterator over the partitioned index.
//
// The top-level index block is obtained through GetOrReadIndexBlock(); I/O is
// disallowed when `read_options.read_tier` is kBlockCacheTier. If that fails:
//   - with a non-null `iter`, `iter` is invalidated with the error and
//     returned;
//   - otherwise a newly created error iterator carrying the status is
//     returned.
//
// On success `iter` is not used. The result is always a newly allocated
// iterator:
//   - if `partition_map_` is non-empty, a two-level iterator whose state
//     refers to `partition_map_`;
//   - otherwise a PartitionedIndexIterator driven by a fresh ReadOptions that
//     copies a fixed set of fields from `read_options`.
// In both cases the top-level index block entry is handed to the returned
// iterator via TransferTo().
InternalIteratorBase<IndexValue>* PartitionIndexReader::NewIterator(
    const ReadOptions& read_options, bool /* disable_prefix_seek */,
    IndexBlockIter* iter, GetContext* get_context,
    BlockCacheLookupContext* lookup_context) {
  const bool cache_only = (read_options.read_tier == kBlockCacheTier);

  CachableEntry<Block> top_level_block;
  const Status load_status = GetOrReadIndexBlock(
      cache_only, get_context, lookup_context, &top_level_block, read_options);
  if (!load_status.ok()) {
    if (iter == nullptr) {
      return NewErrorInternalIterator<IndexValue>(load_status);
    }
    iter->Invalidate(load_status);
    return iter;
  }

  const BlockBasedTable::Rep* const table_rep = table()->rep_;
  Statistics* const no_stats = nullptr;

  // Builds the heap-allocated iterator over the top-level index block.
  // We don't return pinned data from index blocks, so no need
  // to set `block_contents_pinned`.
  const auto new_top_level_iter = [&]() {
    return top_level_block.GetValue()->NewIndexIterator(
        internal_comparator()->user_comparator(),
        table_rep->get_global_seqno(BlockType::kIndex), nullptr, no_stats,
        true, index_has_first_key(), index_key_includes_seq(),
        index_value_is_full(), false /* block_contents_pinned */,
        user_defined_timestamps_persisted());
  };

  InternalIteratorBase<IndexValue>* result = nullptr;

  // Filters are already checked before seeking the index
  if (partition_map_.empty()) {
    // Only these fields of the caller's options are carried over to the
    // partition reads.
    ReadOptions partition_ro;
    partition_ro.fill_cache = read_options.fill_cache;
    partition_ro.deadline = read_options.deadline;
    partition_ro.io_timeout = read_options.io_timeout;
    partition_ro.adaptive_readahead = read_options.adaptive_readahead;
    partition_ro.async_io = read_options.async_io;
    partition_ro.rate_limiter_priority = read_options.rate_limiter_priority;
    partition_ro.verify_checksums = read_options.verify_checksums;
    partition_ro.io_activity = read_options.io_activity;

    std::unique_ptr<InternalIteratorBase<IndexValue>> top_level_iter(
        new_top_level_iter());

    result = new PartitionedIndexIterator(
        table(), partition_ro, *internal_comparator(),
        std::move(top_level_iter),
        lookup_context ? lookup_context->caller
                       : TableReaderCaller::kUncategorized);
  } else {
    result = NewTwoLevelIterator(
        new BlockBasedTable::PartitionedIndexIteratorState(table(),
                                                           &partition_map_),
        new_top_level_iter());
  }

  assert(result != nullptr);
  top_level_block.TransferTo(result);

  // TODO(myabandeh): Update TwoLevelIterator to be able to make use of
  // on-stack BlockIter while the state is on heap. Currently it assumes
  // the first level iter is always on heap and will attempt to delete it
  // in its destructor.
  return result;
}
|
|
// Reads every index partition referenced by the top-level index block and,
// when `pin` is set and every partition could be held, stores them in
// `partition_map_`.
//
// Steps:
//   1. Return immediately if `partition_map_` is already populated.
//   2. Get the top-level index block and iterate over it to find the byte
//      range spanned by the first through the last partition (partitions are
//      assumed to be laid out consecutively).
//   3. Unless `tail_prefetch_buffer` is non-null, enabled, and starts at or
//      before the first partition, create a dedicated prefetch buffer and
//      prefetch the whole range in one request.
//   4. Read the partitions one by one through
//      MaybeReadBlockAndLoadToCache(), using whichever buffer was selected.
//   5. Publish the collected entries into `partition_map_` only if one entry
//      was collected for every partition and iteration ended without error.
//
// Returns the first failing status encountered, or the final status of the
// top-level index iterator.
Status PartitionIndexReader::CacheDependencies(
    const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) {
  // `partition_map_` is filled in an all-or-nothing manner, so a non-empty
  // map means the dependencies are already cached.
  if (!partition_map_.empty()) {
    return Status::OK();
  }

  // Before reading partitions, prefetch them to avoid lots of IOs.
  BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch};
  const BlockBasedTable::Rep* const table_rep = table()->rep_;
  IndexBlockIter partition_iter;
  Statistics* const no_stats = nullptr;

  CachableEntry<Block> top_level_block;
  {
    Status load_status = GetOrReadIndexBlock(
        false /* no_io */, nullptr /* get_context */, &lookup_context,
        &top_level_block, ro);
    if (!load_status.ok()) {
      return load_status;
    }
  }

  // We don't return pinned data from index blocks, so no need
  // to set `block_contents_pinned`.
  top_level_block.GetValue()->NewIndexIterator(
      internal_comparator()->user_comparator(),
      table_rep->get_global_seqno(BlockType::kIndex), &partition_iter,
      no_stats, true, index_has_first_key(), index_key_includes_seq(),
      index_value_is_full(), false /* block_contents_pinned */,
      user_defined_timestamps_persisted());

  // Index partitions are assumed to be consecutive, so the range to prefetch
  // runs from the start of the first partition to the end of the last one.
  partition_iter.SeekToFirst();
  if (!partition_iter.Valid()) {
    // Empty index.
    return partition_iter.status();
  }
  const BlockHandle first_handle = partition_iter.value().handle;
  const uint64_t prefetch_off = first_handle.offset();

  partition_iter.SeekToLast();
  if (!partition_iter.Valid()) {
    // Empty index.
    return partition_iter.status();
  }
  const BlockHandle last_handle = partition_iter.value().handle;
  const uint64_t range_end =
      last_handle.offset() + BlockBasedTable::BlockSizeWithTrailer(last_handle);
  const uint64_t prefetch_len = range_end - prefetch_off;

  // The caller's tail buffer is only used when it exists, is enabled, and
  // does not start after the first partition.
  const bool tail_buffer_usable =
      tail_prefetch_buffer != nullptr && tail_prefetch_buffer->Enabled() &&
      tail_prefetch_buffer->GetPrefetchOffset() <= prefetch_off;

  std::unique_ptr<FilePrefetchBuffer> own_prefetch_buffer;
  if (!tail_buffer_usable) {
    table_rep->CreateFilePrefetchBuffer(
        0, 0, &own_prefetch_buffer, false /*Implicit auto readahead*/,
        0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/);

    IOOptions io_opts;
    Status prefetch_status = table_rep->file->PrepareIOOptions(ro, io_opts);
    if (!prefetch_status.ok()) {
      return prefetch_status;
    }
    prefetch_status = own_prefetch_buffer->Prefetch(
        io_opts, table_rep->file.get(), prefetch_off,
        static_cast<size_t>(prefetch_len), ro.rate_limiter_priority);
    if (!prefetch_status.ok()) {
      return prefetch_status;
    }
  }

  // Prefer the buffer created above; otherwise fall back to the caller's.
  FilePrefetchBuffer* const read_buffer =
      own_prefetch_buffer ? own_prefetch_buffer.get() : tail_prefetch_buffer;

  // For saving "all or nothing" to partition_map_
  UnorderedMap<uint64_t, CachableEntry<Block>> map_in_progress;
  size_t partition_count = 0;

  // After prefetch, read the partitions one by one
  for (partition_iter.SeekToFirst(); partition_iter.Valid();
       partition_iter.Next()) {
    const BlockHandle partition_handle = partition_iter.value().handle;
    CachableEntry<Block> partition_block;
    ++partition_count;

    // TODO: Support counter batch update for partitioned index and
    // filter blocks
    Status read_status = table()->MaybeReadBlockAndLoadToCache(
        read_buffer, ro, partition_handle, UncompressionDict::GetEmptyDict(),
        /*for_compaction=*/false, &partition_block.As<Block_kIndex>(),
        /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr,
        /*async_read=*/false);
    if (!read_status.ok()) {
      return read_status;
    }

    // Might need to "pin" some mmap-read blocks (GetOwnValue) if some
    // partitions are successfully compressed (cached) and some are not
    // compressed (mmap eligible)
    if (partition_block.GetValue() != nullptr &&
        (partition_block.IsCached() || partition_block.GetOwnValue()) &&
        pin) {
      map_in_progress[partition_handle.offset()] = std::move(partition_block);
    }
  }

  Status s = partition_iter.status();
  // Save (pin) them only if everything checks out
  if (map_in_progress.size() == partition_count && s.ok()) {
    std::swap(partition_map_, map_in_progress);
  }
  return s;
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|