mirror of
https://github.com/facebook/rocksdb.git
synced 2024-11-28 05:43:50 +00:00
8f763bdeab
Summary: **Context:** We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics. **Summary:** - Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()` - For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more. - Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest). Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406 Test Plan: 1. New UT 2. db bench Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison. ``` diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index bd5669f0f..791484c1f 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail( &tail_prefetch_size); // Try file system prefetch - if (!file->use_direct_io() && !force_direct_prefetch) { + if (false && !file->use_direct_io() && !force_direct_prefetch) { if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority) .IsNotSupported()) { prefetch_buffer->reset(new FilePrefetchBuffer( diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index ea40f5fa0..39a0ac385 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -4191,6 +4191,8 @@ class Benchmark { std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options)); } else { BlockBasedTableOptions block_based_options; + block_based_options.metadata_cache_options.partition_pinning = + PinningTier::kAll; block_based_options.checksum = static_cast<ChecksumType>(FLAGS_checksum_type); if (FLAGS_use_hash_search) { ``` Create DB ``` ./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none ``` ReadRandom ``` ./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none ``` (a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies()) ``` rocksdb.table.open.prefetch.tail.hit COUNT : 3395 rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614 ``` (b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies()) ``` rocksdb.table.open.prefetch.tail.hit COUNT : 14257 rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540 ``` As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR 3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR. Reviewed By: pdillinger Differential Revision: D45413346 Pulled By: hx235 fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
470 lines
19 KiB
C++
470 lines
19 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#pragma once
|
|
|
|
#include <algorithm>
|
|
#include <atomic>
|
|
#include <sstream>
|
|
#include <string>
|
|
|
|
#include "file/readahead_file_info.h"
|
|
#include "monitoring/statistics.h"
|
|
#include "port/port.h"
|
|
#include "rocksdb/env.h"
|
|
#include "rocksdb/file_system.h"
|
|
#include "rocksdb/options.h"
|
|
#include "util/aligned_buffer.h"
|
|
#include "util/autovector.h"
|
|
#include "util/stop_watch.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
#define DEFAULT_DECREMENT 8 * 1024
|
|
|
|
struct IOOptions;
|
|
class RandomAccessFileReader;
|
|
|
|
struct BufferInfo {
|
|
AlignedBuffer buffer_;
|
|
|
|
uint64_t offset_ = 0;
|
|
|
|
// Below parameters are used in case of async read flow.
|
|
// Length requested for in ReadAsync.
|
|
size_t async_req_len_ = 0;
|
|
|
|
// async_read_in_progress can be used as mutex. Callback can update the buffer
|
|
// and its size but async_read_in_progress is only set by main thread.
|
|
bool async_read_in_progress_ = false;
|
|
|
|
// io_handle is allocated and used by underlying file system in case of
|
|
// asynchronous reads.
|
|
void* io_handle_ = nullptr;
|
|
|
|
IOHandleDeleter del_fn_ = nullptr;
|
|
|
|
// pos represents the index of this buffer in vector of BufferInfo.
|
|
uint32_t pos_ = 0;
|
|
};
|
|
|
|
enum class FilePrefetchBufferUsage {
|
|
kTableOpenPrefetchTail,
|
|
kUnknown,
|
|
};
|
|
|
|
// FilePrefetchBuffer is a smart buffer to store and read data from a file.
|
|
class FilePrefetchBuffer {
|
|
public:
|
|
// Constructor.
|
|
//
|
|
// All arguments are optional.
|
|
// readahead_size : the initial readahead size.
|
|
// max_readahead_size : the maximum readahead size.
|
|
// If max_readahead_size > readahead_size, the readahead size will be
|
|
// doubled on every IO until max_readahead_size is hit.
|
|
// Typically this is set as a multiple of readahead_size.
|
|
// max_readahead_size should be greater than equal to readahead_size.
|
|
// enable : controls whether reading from the buffer is enabled.
|
|
// If false, TryReadFromCache() always return false, and we only take stats
|
|
// for the minimum offset if track_min_offset = true.
|
|
// track_min_offset : Track the minimum offset ever read and collect stats on
|
|
// it. Used for adaptable readahead of the file footer/metadata.
|
|
// implicit_auto_readahead : Readahead is enabled implicitly by rocksdb after
|
|
// doing sequential scans for two times.
|
|
//
|
|
// Automatic readhead is enabled for a file if readahead_size
|
|
// and max_readahead_size are passed in.
|
|
// A user can construct a FilePrefetchBuffer without any arguments, but use
|
|
// `Prefetch` to load data into the buffer.
|
|
FilePrefetchBuffer(
|
|
size_t readahead_size = 0, size_t max_readahead_size = 0,
|
|
bool enable = true, bool track_min_offset = false,
|
|
bool implicit_auto_readahead = false, uint64_t num_file_reads = 0,
|
|
uint64_t num_file_reads_for_auto_readahead = 0, FileSystem* fs = nullptr,
|
|
SystemClock* clock = nullptr, Statistics* stats = nullptr,
|
|
FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown)
|
|
: curr_(0),
|
|
readahead_size_(readahead_size),
|
|
initial_auto_readahead_size_(readahead_size),
|
|
max_readahead_size_(max_readahead_size),
|
|
min_offset_read_(std::numeric_limits<size_t>::max()),
|
|
enable_(enable),
|
|
track_min_offset_(track_min_offset),
|
|
implicit_auto_readahead_(implicit_auto_readahead),
|
|
prev_offset_(0),
|
|
prev_len_(0),
|
|
num_file_reads_for_auto_readahead_(num_file_reads_for_auto_readahead),
|
|
num_file_reads_(num_file_reads),
|
|
explicit_prefetch_submitted_(false),
|
|
fs_(fs),
|
|
clock_(clock),
|
|
stats_(stats),
|
|
usage_(usage) {
|
|
assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) ||
|
|
(num_file_reads_ == 0));
|
|
// If ReadOptions.async_io is enabled, data is asynchronously filled in
|
|
// second buffer while curr_ is being consumed. If data is overlapping in
|
|
// two buffers, data is copied to third buffer to return continuous buffer.
|
|
bufs_.resize(3);
|
|
for (uint32_t i = 0; i < 2; i++) {
|
|
bufs_[i].pos_ = i;
|
|
}
|
|
}
|
|
|
|
~FilePrefetchBuffer() {
|
|
// Abort any pending async read request before destroying the class object.
|
|
if (fs_ != nullptr) {
|
|
std::vector<void*> handles;
|
|
for (uint32_t i = 0; i < 2; i++) {
|
|
if (bufs_[i].async_read_in_progress_ &&
|
|
bufs_[i].io_handle_ != nullptr) {
|
|
handles.emplace_back(bufs_[i].io_handle_);
|
|
}
|
|
}
|
|
if (!handles.empty()) {
|
|
StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
|
|
Status s = fs_->AbortIO(handles);
|
|
assert(s.ok());
|
|
}
|
|
}
|
|
|
|
// Prefetch buffer bytes discarded.
|
|
uint64_t bytes_discarded = 0;
|
|
// Iterated over 2 buffers.
|
|
for (int i = 0; i < 2; i++) {
|
|
int first = i;
|
|
int second = i ^ 1;
|
|
|
|
if (DoesBufferContainData(first)) {
|
|
// If last block was read completely from first and some bytes in
|
|
// first buffer are still unconsumed.
|
|
if (prev_offset_ >= bufs_[first].offset_ &&
|
|
prev_offset_ + prev_len_ <
|
|
bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize()) {
|
|
bytes_discarded += bufs_[first].buffer_.CurrentSize() -
|
|
(prev_offset_ + prev_len_ - bufs_[first].offset_);
|
|
}
|
|
// If data was in second buffer and some/whole block bytes were read
|
|
// from second buffer.
|
|
else if (prev_offset_ < bufs_[first].offset_ &&
|
|
!DoesBufferContainData(second)) {
|
|
// If last block read was completely from different buffer, this
|
|
// buffer is unconsumed.
|
|
if (prev_offset_ + prev_len_ <= bufs_[first].offset_) {
|
|
bytes_discarded += bufs_[first].buffer_.CurrentSize();
|
|
}
|
|
// If last block read overlaps with this buffer and some data is
|
|
// still unconsumed and previous buffer (second) is not cleared.
|
|
else if (prev_offset_ + prev_len_ > bufs_[first].offset_ &&
|
|
bufs_[first].offset_ + bufs_[first].buffer_.CurrentSize() ==
|
|
bufs_[second].offset_) {
|
|
bytes_discarded += bufs_[first].buffer_.CurrentSize() -
|
|
(/*bytes read from this buffer=*/prev_len_ -
|
|
(bufs_[first].offset_ - prev_offset_));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for (uint32_t i = 0; i < 2; i++) {
|
|
// Release io_handle.
|
|
DestroyAndClearIOHandle(i);
|
|
}
|
|
RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);
|
|
}
|
|
|
|
bool Enabled() const { return enable_; }
|
|
|
|
// Load data into the buffer from a file.
|
|
// reader : the file reader.
|
|
// offset : the file offset to start reading from.
|
|
// n : the number of bytes to read.
|
|
// rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
|
|
// bypass.
|
|
Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader,
|
|
uint64_t offset, size_t n,
|
|
Env::IOPriority rate_limiter_priority);
|
|
|
|
// Request for reading the data from a file asynchronously.
|
|
// If data already exists in the buffer, result will be updated.
|
|
// reader : the file reader.
|
|
// offset : the file offset to start reading from.
|
|
// n : the number of bytes to read.
|
|
// result : if data already exists in the buffer, result will
|
|
// be updated with the data.
|
|
//
|
|
// If data already exist in the buffer, it will return Status::OK, otherwise
|
|
// it will send asynchronous request and return Status::TryAgain.
|
|
Status PrefetchAsync(const IOOptions& opts, RandomAccessFileReader* reader,
|
|
uint64_t offset, size_t n, Slice* result);
|
|
|
|
// Tries returning the data for a file read from this buffer if that data is
|
|
// in the buffer.
|
|
// It handles tracking the minimum read offset if track_min_offset = true.
|
|
// It also does the exponential readahead when readahead_size is set as part
|
|
// of the constructor.
|
|
//
|
|
// opts : the IO options to use.
|
|
// reader : the file reader.
|
|
// offset : the file offset.
|
|
// n : the number of bytes.
|
|
// result : output buffer to put the data into.
|
|
// s : output status.
|
|
// rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to
|
|
// bypass.
|
|
// for_compaction : true if cache read is done for compaction read.
|
|
bool TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader,
|
|
uint64_t offset, size_t n, Slice* result, Status* s,
|
|
Env::IOPriority rate_limiter_priority,
|
|
bool for_compaction = false);
|
|
|
|
bool TryReadFromCacheAsync(const IOOptions& opts,
|
|
RandomAccessFileReader* reader, uint64_t offset,
|
|
size_t n, Slice* result, Status* status,
|
|
Env::IOPriority rate_limiter_priority);
|
|
|
|
// The minimum `offset` ever passed to TryReadFromCache(). This will nly be
|
|
// tracked if track_min_offset = true.
|
|
size_t min_offset_read() const { return min_offset_read_; }
|
|
|
|
// Called in case of implicit auto prefetching.
|
|
void UpdateReadPattern(const uint64_t& offset, const size_t& len,
|
|
bool decrease_readaheadsize) {
|
|
if (decrease_readaheadsize) {
|
|
// Since this block was eligible for prefetch but it was found in
|
|
// cache, so check and decrease the readahead_size by 8KB (default)
|
|
// if eligible.
|
|
DecreaseReadAheadIfEligible(offset, len);
|
|
}
|
|
prev_offset_ = offset;
|
|
prev_len_ = len;
|
|
explicit_prefetch_submitted_ = false;
|
|
}
|
|
|
|
void GetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) {
|
|
readahead_info->readahead_size = readahead_size_;
|
|
readahead_info->num_file_reads = num_file_reads_;
|
|
}
|
|
|
|
void DecreaseReadAheadIfEligible(uint64_t offset, size_t size,
|
|
size_t value = DEFAULT_DECREMENT) {
|
|
// Decrease the readahead_size if
|
|
// - its enabled internally by RocksDB (implicit_auto_readahead_) and,
|
|
// - readahead_size is greater than 0 and,
|
|
// - this block would have called prefetch API if not found in cache for
|
|
// which conditions are:
|
|
// - few/no bytes are in buffer and,
|
|
// - block is sequential with the previous read and,
|
|
// - num_file_reads_ + 1 (including this read) >
|
|
// num_file_reads_for_auto_readahead_
|
|
size_t curr_size = bufs_[curr_].async_read_in_progress_
|
|
? bufs_[curr_].async_req_len_
|
|
: bufs_[curr_].buffer_.CurrentSize();
|
|
if (implicit_auto_readahead_ && readahead_size_ > 0) {
|
|
if ((offset + size > bufs_[curr_].offset_ + curr_size) &&
|
|
IsBlockSequential(offset) &&
|
|
(num_file_reads_ + 1 > num_file_reads_for_auto_readahead_)) {
|
|
readahead_size_ =
|
|
std::max(initial_auto_readahead_size_,
|
|
(readahead_size_ >= value ? readahead_size_ - value : 0));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Callback function passed to underlying FS in case of asynchronous reads.
|
|
void PrefetchAsyncCallback(const FSReadRequest& req, void* cb_arg);
|
|
|
|
private:
|
|
// Calculates roundoff offset and length to be prefetched based on alignment
|
|
// and data present in buffer_. It also allocates new buffer or refit tail if
|
|
// required.
|
|
void CalculateOffsetAndLen(size_t alignment, uint64_t offset,
|
|
size_t roundup_len, uint32_t index,
|
|
bool refit_tail, uint64_t& chunk_len);
|
|
|
|
void AbortIOIfNeeded(uint64_t offset);
|
|
|
|
void AbortAllIOs();
|
|
|
|
void UpdateBuffersIfNeeded(uint64_t offset);
|
|
|
|
// It calls Poll API if any there is any pending asynchronous request. It then
|
|
// checks if data is in any buffer. It clears the outdated data and swaps the
|
|
// buffers if required.
|
|
void PollAndUpdateBuffersIfNeeded(uint64_t offset);
|
|
|
|
Status PrefetchAsyncInternal(const IOOptions& opts,
|
|
RandomAccessFileReader* reader, uint64_t offset,
|
|
size_t length, size_t readahead_size,
|
|
Env::IOPriority rate_limiter_priority,
|
|
bool& copy_to_third_buffer);
|
|
|
|
Status Read(const IOOptions& opts, RandomAccessFileReader* reader,
|
|
Env::IOPriority rate_limiter_priority, uint64_t read_len,
|
|
uint64_t chunk_len, uint64_t rounddown_start, uint32_t index);
|
|
|
|
Status ReadAsync(const IOOptions& opts, RandomAccessFileReader* reader,
|
|
uint64_t read_len, uint64_t rounddown_start, uint32_t index);
|
|
|
|
// Copy the data from src to third buffer.
|
|
void CopyDataToBuffer(uint32_t src, uint64_t& offset, size_t& length);
|
|
|
|
bool IsBlockSequential(const size_t& offset) {
|
|
return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset));
|
|
}
|
|
|
|
// Called in case of implicit auto prefetching.
|
|
void ResetValues() {
|
|
num_file_reads_ = 1;
|
|
readahead_size_ = initial_auto_readahead_size_;
|
|
}
|
|
|
|
// Called in case of implicit auto prefetching.
|
|
bool IsEligibleForPrefetch(uint64_t offset, size_t n) {
|
|
// Prefetch only if this read is sequential otherwise reset readahead_size_
|
|
// to initial value.
|
|
if (!IsBlockSequential(offset)) {
|
|
UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
|
|
ResetValues();
|
|
return false;
|
|
}
|
|
num_file_reads_++;
|
|
|
|
// Since async request was submitted in last call directly by calling
|
|
// PrefetchAsync, it skips num_file_reads_ check as this call is to poll the
|
|
// data submitted in previous call.
|
|
if (explicit_prefetch_submitted_) {
|
|
return true;
|
|
}
|
|
if (num_file_reads_ <= num_file_reads_for_auto_readahead_) {
|
|
UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Helper functions.
|
|
bool IsDataBlockInBuffer(uint64_t offset, size_t length, uint32_t index) {
|
|
return (offset >= bufs_[index].offset_ &&
|
|
offset + length <=
|
|
bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
|
|
}
|
|
bool IsOffsetInBuffer(uint64_t offset, uint32_t index) {
|
|
return (offset >= bufs_[index].offset_ &&
|
|
offset < bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
|
|
}
|
|
bool DoesBufferContainData(uint32_t index) {
|
|
return bufs_[index].buffer_.CurrentSize() > 0;
|
|
}
|
|
bool IsBufferOutdated(uint64_t offset, uint32_t index) {
|
|
return (
|
|
!bufs_[index].async_read_in_progress_ && DoesBufferContainData(index) &&
|
|
offset >= bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
|
|
}
|
|
bool IsBufferOutdatedWithAsyncProgress(uint64_t offset, uint32_t index) {
|
|
return (bufs_[index].async_read_in_progress_ &&
|
|
bufs_[index].io_handle_ != nullptr &&
|
|
offset >= bufs_[index].offset_ + bufs_[index].async_req_len_);
|
|
}
|
|
bool IsOffsetInBufferWithAsyncProgress(uint64_t offset, uint32_t index) {
|
|
return (bufs_[index].async_read_in_progress_ &&
|
|
offset >= bufs_[index].offset_ &&
|
|
offset < bufs_[index].offset_ + bufs_[index].async_req_len_);
|
|
}
|
|
|
|
bool IsSecondBuffEligibleForPrefetching() {
|
|
uint32_t second = curr_ ^ 1;
|
|
if (bufs_[second].async_read_in_progress_) {
|
|
return false;
|
|
}
|
|
assert(!bufs_[curr_].async_read_in_progress_);
|
|
|
|
if (DoesBufferContainData(curr_) && DoesBufferContainData(second) &&
|
|
(bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() ==
|
|
bufs_[second].offset_)) {
|
|
return false;
|
|
}
|
|
bufs_[second].buffer_.Clear();
|
|
return true;
|
|
}
|
|
|
|
void DestroyAndClearIOHandle(uint32_t index) {
|
|
if (bufs_[index].io_handle_ != nullptr && bufs_[index].del_fn_ != nullptr) {
|
|
bufs_[index].del_fn_(bufs_[index].io_handle_);
|
|
bufs_[index].io_handle_ = nullptr;
|
|
bufs_[index].del_fn_ = nullptr;
|
|
}
|
|
bufs_[index].async_read_in_progress_ = false;
|
|
}
|
|
|
|
Status HandleOverlappingData(const IOOptions& opts,
|
|
RandomAccessFileReader* reader, uint64_t offset,
|
|
size_t length, size_t readahead_size,
|
|
Env::IOPriority rate_limiter_priority,
|
|
bool& copy_to_third_buffer, uint64_t& tmp_offset,
|
|
size_t& tmp_length);
|
|
|
|
bool TryReadFromCacheUntracked(const IOOptions& opts,
|
|
RandomAccessFileReader* reader,
|
|
uint64_t offset, size_t n, Slice* result,
|
|
Status* s,
|
|
Env::IOPriority rate_limiter_priority,
|
|
bool for_compaction = false);
|
|
|
|
bool TryReadFromCacheAsyncUntracked(const IOOptions& opts,
|
|
RandomAccessFileReader* reader,
|
|
uint64_t offset, size_t n, Slice* result,
|
|
Status* status,
|
|
Env::IOPriority rate_limiter_priority);
|
|
|
|
std::vector<BufferInfo> bufs_;
|
|
// curr_ represents the index for bufs_ indicating which buffer is being
|
|
// consumed currently.
|
|
uint32_t curr_;
|
|
|
|
size_t readahead_size_;
|
|
size_t initial_auto_readahead_size_;
|
|
// FilePrefetchBuffer object won't be created from Iterator flow if
|
|
// max_readahead_size_ = 0.
|
|
size_t max_readahead_size_;
|
|
|
|
// The minimum `offset` ever passed to TryReadFromCache().
|
|
size_t min_offset_read_;
|
|
// if false, TryReadFromCache() always return false, and we only take stats
|
|
// for track_min_offset_ if track_min_offset_ = true
|
|
bool enable_;
|
|
// If true, track minimum `offset` ever passed to TryReadFromCache(), which
|
|
// can be fetched from min_offset_read().
|
|
bool track_min_offset_;
|
|
|
|
// implicit_auto_readahead is enabled by rocksdb internally after 2
|
|
// sequential IOs.
|
|
bool implicit_auto_readahead_;
|
|
uint64_t prev_offset_;
|
|
size_t prev_len_;
|
|
// num_file_reads_ and num_file_reads_for_auto_readahead_ is only used when
|
|
// implicit_auto_readahead_ is set.
|
|
uint64_t num_file_reads_for_auto_readahead_;
|
|
uint64_t num_file_reads_;
|
|
|
|
// If explicit_prefetch_submitted_ is set then it indicates RocksDB called
|
|
// PrefetchAsync to submit request. It needs to call TryReadFromCacheAsync to
|
|
// poll the submitted request without checking if data is sequential and
|
|
// num_file_reads_.
|
|
bool explicit_prefetch_submitted_;
|
|
|
|
FileSystem* fs_;
|
|
SystemClock* clock_;
|
|
Statistics* stats_;
|
|
|
|
FilePrefetchBufferUsage usage_;
|
|
};
|
|
} // namespace ROCKSDB_NAMESPACE
|