mirror of https://github.com/facebook/rocksdb.git
993 lines
36 KiB
C++
993 lines
36 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
#include "file/file_prefetch_buffer.h"
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
|
|
#include "file/random_access_file_reader.h"
|
|
#include "monitoring/histogram.h"
|
|
#include "monitoring/iostats_context_imp.h"
|
|
#include "port/port.h"
|
|
#include "test_util/sync_point.h"
|
|
#include "util/random.h"
|
|
#include "util/rate_limiter_impl.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Makes sure `buf` can hold `roundup_len` bytes for a read that starts at file
// offset `offset`, keeping any already-buffered bytes that are still useful.
//
// buf                - buffer being prepared.
// alignment          - alignment applied to the buffer and to the start of
//                      the preserved region.
// offset             - file offset the upcoming read starts at.
// roundup_len        - capacity the buffer must provide.
// refit_tail         - when true and the current capacity is sufficient, the
//                      preserved bytes are moved within the existing buffer
//                      through RefitTail() instead of allocating a new one.
// aligned_useful_len - in/out. When `offset` lies inside the buffered data it
//                      is set to the number of buffered bytes from the aligned
//                      position at or below `offset` to the end of the data.
//                      Otherwise it is left untouched, so the caller must
//                      initialize it (callers in this file pass 0).
void FilePrefetchBuffer::PrepareBufferForRead(BufferInfo* buf, size_t alignment,
                                              uint64_t offset,
                                              size_t roundup_len,
                                              bool refit_tail,
                                              uint64_t& aligned_useful_len) {
  // Position inside the current buffer where the reusable bytes begin.
  uint64_t useful_start_in_buf = 0;
  bool carry_over_old_bytes = false;

  // Incremental reads commonly land inside data that is already buffered. In
  // that case keep the tail starting at the aligned position and read only
  // what is missing; otherwise the whole range comes from the file.
  if (buf->DoesBufferContainData() && buf->IsOffsetInBuffer(offset)) {
    useful_start_in_buf =
        Rounddown(static_cast<size_t>(offset - buf->offset_), alignment);
    aligned_useful_len =
        static_cast<uint64_t>(buf->CurrentSize()) - useful_start_in_buf;
    assert(useful_start_in_buf % alignment == 0);
    assert(aligned_useful_len % alignment == 0);
    assert(useful_start_in_buf + aligned_useful_len <=
           buf->offset_ + buf->CurrentSize());

    carry_over_old_bytes = (aligned_useful_len > 0);
    if (!carry_over_old_bytes) {
      // Not strictly required; keeps the start position in a known state.
      useful_start_in_buf = 0;
    }
  }

  const bool capacity_too_small = buf->buffer_.Capacity() < roundup_len;
  const bool has_useful_bytes = aligned_useful_len > 0;

  if (!capacity_too_small && has_useful_bytes && refit_tail) {
    // The existing allocation is large enough: just move the useful tail to
    // the beginning of the buffer.
    buf->buffer_.RefitTail(static_cast<size_t>(useful_start_in_buf),
                           static_cast<size_t>(aligned_useful_len));
  } else if (capacity_too_small || has_useful_bytes) {
    // Either the capacity is insufficient, or useful bytes exist but the
    // caller did not ask for a refit (the async prefetching path). In the
    // latter case a fresh buffer is allocated because the aligned buffer
    // computes its remaining space as capacity - current size, which does not
    // hold when the tail has not been refitted.
    // TODO: Use refit_tail for async prefetching too.
    buf->buffer_.Alignment(alignment);
    buf->buffer_.AllocateNewBuffer(roundup_len, carry_over_old_bytes,
                                   useful_start_in_buf,
                                   static_cast<size_t>(aligned_useful_len));
  }
}
|
|
|
|
// Synchronously reads `read_len` bytes from `reader` into `buf`, placing them
// right after the `aligned_useful_len` bytes that are already at the front of
// the buffer. The file position read from is
// `start_offset + aligned_useful_len`.
//
// Returns the status of the underlying read, or Corruption if the reader
// handed back data that does not live in our buffer. On success the buffer
// size becomes `aligned_useful_len` plus the number of bytes actually read.
Status FilePrefetchBuffer::Read(BufferInfo* buf, const IOOptions& opts,
                                RandomAccessFileReader* reader,
                                uint64_t read_len, uint64_t aligned_useful_len,
                                uint64_t start_offset) {
  char* const destination = buf->buffer_.BufferStart() + aligned_useful_len;
  const uint64_t file_pos = start_offset + aligned_useful_len;

  Slice bytes_read;
  Status s = reader->Read(opts, file_pos, read_len, &bytes_read, destination,
                          /*aligned_buf=*/nullptr);
#ifndef NDEBUG
  if (bytes_read.size() < read_len) {
    // Fake an IO error to force db_stress fault injection to ignore
    // truncated read errors
    IGNORE_STATUS_IF_ERROR(Status::IOError());
  }
#endif
  if (!s.ok()) {
    return s;
  }

  if (bytes_read.data() != destination) {
    // The data came from some other in-memory buffer (such as mmap). Copying
    // it again into this FilePrefetchBuffer would be wasteful, so the caller
    // is expected to exclude this case.
    assert(false);
    return Status::Corruption("File read didn't populate our buffer");
  }

  if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) {
    RecordTick(stats_, PREFETCH_BYTES, read_len);
  }

  // Account for the preserved prefix plus the freshly read bytes.
  buf->buffer_.Size(static_cast<size_t>(aligned_useful_len) +
                    bytes_read.size());
  return s;
}
|
|
|
|
// Submits an asynchronous read of `read_len` bytes at `start_offset` whose
// destination is the start of `buf`. PrefetchAsyncCallback() is registered as
// the completion callback with `buf` as its argument.
//
// On successful submission the buffer is flagged as having an async read in
// progress and PREFETCH_BYTES is bumped by `read_len`. The submission status
// is returned to the caller.
Status FilePrefetchBuffer::ReadAsync(BufferInfo* buf, const IOOptions& opts,
                                     RandomAccessFileReader* reader,
                                     uint64_t read_len, uint64_t start_offset) {
  TEST_SYNC_POINT("FilePrefetchBuffer::ReadAsync");

  // Completion callback for the async read request.
  auto on_complete = [this](FSReadRequest& done_req, void* cb_arg) {
    PrefetchAsyncCallback(done_req, cb_arg);
  };

  FSReadRequest req;
  req.offset = start_offset;
  req.len = read_len;
  req.result = Slice();
  req.scratch = buf->buffer_.BufferStart();

  // Remember how many bytes were requested for this buffer.
  buf->async_req_len_ = req.len;

  Status s = reader->ReadAsync(req, opts, on_complete, buf, &(buf->io_handle_),
                               &(buf->del_fn_), /*aligned_buf =*/nullptr);
  req.status.PermitUncheckedError();

  if (s.ok()) {
    RecordTick(stats_, PREFETCH_BYTES, read_len);
    buf->async_read_in_progress_ = true;
  }
  return s;
}
|
|
|
|
// Synchronous prefetch into the single buffer of this FilePrefetchBuffer.
//
// Does nothing (returns OK) when prefetching is disabled, when `reader` is
// null, or when `offset + n` does not extend past the end of the data already
// buffered. Otherwise the read range is computed by ReadAheadSizeTuning() and
// the missing bytes are read with Read(). When this buffer is used for the
// table-open tail prefetch and the read succeeded, the number of bytes read is
// recorded in the TABLE_OPEN_PREFETCH_TAIL_READ_BYTES histogram.
Status FilePrefetchBuffer::Prefetch(const IOOptions& opts,
                                    RandomAccessFileReader* reader,
                                    uint64_t offset, size_t n) {
  if (!enable_ || reader == nullptr) {
    return Status::OK();
  }

  // This entry point is only meant for the single-buffer configuration.
  assert(num_buffers_ == 1);

  AllocateBufferIfEmpty();
  BufferInfo* target = GetFirstBuffer();

  TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start");

  const uint64_t buffered_end = target->offset_ + target->CurrentSize();
  if (offset + n <= buffered_end) {
    // Everything requested is already buffered; no need to read again.
    return Status::OK();
  }

  const size_t alignment = reader->file()->GetRequiredBufferAlignment();
  uint64_t read_start = offset;
  uint64_t read_end = 0;
  uint64_t preserved_len = 0;
  size_t bytes_to_read = 0;

  // The request size is passed as the readahead amount, with a zero length.
  ReadAheadSizeTuning(target, /*read_curr_block=*/true,
                      /*refit_tail=*/true, /*prev_buf_end_offset=*/read_start,
                      alignment, /*length=*/0, /*readahead_size=*/n, read_start,
                      read_end, bytes_to_read, preserved_len);

  Status s;
  if (bytes_to_read > 0) {
    s = Read(target, opts, reader, bytes_to_read, preserved_len, read_start);
  }

  if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) {
    RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
                      bytes_to_read);
  }
  return s;
}
|
|
|
|
// Copy data from src to overlap_buf_.
|
|
void FilePrefetchBuffer::CopyDataToBuffer(BufferInfo* src, uint64_t& offset,
|
|
size_t& length) {
|
|
if (length == 0) {
|
|
return;
|
|
}
|
|
|
|
uint64_t copy_offset = (offset - src->offset_);
|
|
size_t copy_len = 0;
|
|
if (src->IsDataBlockInBuffer(offset, length)) {
|
|
// All the bytes are in src.
|
|
copy_len = length;
|
|
} else {
|
|
copy_len = src->CurrentSize() - copy_offset;
|
|
}
|
|
|
|
BufferInfo* dst = overlap_buf_;
|
|
memcpy(dst->buffer_.BufferStart() + dst->CurrentSize(),
|
|
src->buffer_.BufferStart() + copy_offset, copy_len);
|
|
|
|
dst->buffer_.Size(dst->CurrentSize() + copy_len);
|
|
|
|
// Update offset and length.
|
|
offset += copy_len;
|
|
length -= copy_len;
|
|
|
|
// length > 0 indicates it has consumed all data from the src buffer and it
|
|
// still needs to read more other buffer.
|
|
if (length > 0) {
|
|
FreeFrontBuffer();
|
|
}
|
|
}
|
|
|
|
// Clear the buffers if it contains outdated data. Outdated data can be because
|
|
// previous sequential reads were read from the cache instead of these buffer.
|
|
// In that case outdated IOs should be aborted.
|
|
void FilePrefetchBuffer::AbortOutdatedIO(uint64_t offset) {
|
|
std::vector<void*> handles;
|
|
std::vector<BufferInfo*> tmp_buf;
|
|
for (auto& buf : bufs_) {
|
|
if (buf->IsBufferOutdatedWithAsyncProgress(offset)) {
|
|
handles.emplace_back(buf->io_handle_);
|
|
tmp_buf.emplace_back(buf);
|
|
}
|
|
}
|
|
|
|
if (!handles.empty()) {
|
|
StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
|
|
Status s = fs_->AbortIO(handles);
|
|
assert(s.ok());
|
|
}
|
|
|
|
for (auto& buf : tmp_buf) {
|
|
if (buf->async_read_in_progress_) {
|
|
DestroyAndClearIOHandle(buf);
|
|
buf->async_read_in_progress_ = false;
|
|
}
|
|
buf->ClearBuffer();
|
|
}
|
|
}
|
|
|
|
void FilePrefetchBuffer::AbortAllIOs() {
|
|
std::vector<void*> handles;
|
|
for (auto& buf : bufs_) {
|
|
if (buf->async_read_in_progress_ && buf->io_handle_ != nullptr) {
|
|
handles.emplace_back(buf->io_handle_);
|
|
}
|
|
}
|
|
if (!handles.empty()) {
|
|
StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
|
|
Status s = fs_->AbortIO(handles);
|
|
assert(s.ok());
|
|
}
|
|
|
|
for (auto& buf : bufs_) {
|
|
if (buf->io_handle_ != nullptr && buf->del_fn_ != nullptr) {
|
|
DestroyAndClearIOHandle(buf);
|
|
}
|
|
buf->async_read_in_progress_ = false;
|
|
}
|
|
}
|
|
|
|
// Clear the buffers if it contains outdated data wrt offset. Outdated data can
|
|
// be because previous sequential reads were read from the cache instead of
|
|
// these buffer or there is IOError while filling the buffers.
|
|
//
|
|
// offset - the offset requested to be read. This API makes sure that the
|
|
// front/first buffer in bufs_ should contain this offset, otherwise, all
|
|
// buffers will be freed.
|
|
void FilePrefetchBuffer::ClearOutdatedData(uint64_t offset, size_t length) {
|
|
while (!IsBufferQueueEmpty()) {
|
|
BufferInfo* buf = GetFirstBuffer();
|
|
// Offset is greater than this buffer's end offset.
|
|
if (buf->IsBufferOutdated(offset)) {
|
|
FreeFrontBuffer();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
|
|
return;
|
|
}
|
|
|
|
BufferInfo* buf = GetFirstBuffer();
|
|
|
|
if (buf->async_read_in_progress_) {
|
|
FreeEmptyBuffers();
|
|
return;
|
|
}
|
|
|
|
// Below handles the case for Overlapping buffers (NumBuffersAllocated > 1).
|
|
bool abort_io = false;
|
|
|
|
if (buf->DoesBufferContainData() && buf->IsOffsetInBuffer(offset)) {
|
|
BufferInfo* next_buf = bufs_[1];
|
|
if (/* next buffer doesn't align with first buffer and requested data
|
|
overlaps with next buffer */
|
|
((buf->offset_ + buf->CurrentSize() != next_buf->offset_) &&
|
|
(offset + length > buf->offset_ + buf->CurrentSize()))) {
|
|
abort_io = true;
|
|
}
|
|
} else {
|
|
// buffer with offset doesn't contain data or offset doesn't lie in this
|
|
// buffer.
|
|
buf->ClearBuffer();
|
|
abort_io = true;
|
|
}
|
|
|
|
if (abort_io) {
|
|
AbortAllIOs();
|
|
// Clear all buffers after first.
|
|
for (size_t i = 1; i < bufs_.size(); ++i) {
|
|
bufs_[i]->ClearBuffer();
|
|
}
|
|
}
|
|
FreeEmptyBuffers();
|
|
assert(IsBufferQueueEmpty() || buf->IsOffsetInBuffer(offset));
|
|
}
|
|
|
|
void FilePrefetchBuffer::PollIfNeeded(uint64_t offset, size_t length) {
|
|
BufferInfo* buf = GetFirstBuffer();
|
|
|
|
if (buf->async_read_in_progress_ && fs_ != nullptr) {
|
|
if (buf->io_handle_ != nullptr) {
|
|
// Wait for prefetch data to complete.
|
|
// No mutex is needed as async_read_in_progress behaves as mutex and is
|
|
// updated by main thread only.
|
|
std::vector<void*> handles;
|
|
handles.emplace_back(buf->io_handle_);
|
|
StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
|
|
fs_->Poll(handles, 1).PermitUncheckedError();
|
|
}
|
|
|
|
// Reset and Release io_handle after the Poll API as request has been
|
|
// completed.
|
|
DestroyAndClearIOHandle(buf);
|
|
}
|
|
|
|
// Always call outdated data after Poll as Buffers might be out of sync w.r.t
|
|
// offset and length.
|
|
ClearOutdatedData(offset, length);
|
|
}
|
|
|
|
// ReadAheadSizeTuning API calls readaheadsize_cb_
|
|
// (BlockBasedTableIterator::BlockCacheLookupForReadAheadSize) to lookup in the
|
|
// cache and tune the start and end offsets based on cache hits/misses.
|
|
//
|
|
// Arguments -
|
|
// read_curr_block : True if this call was due to miss in the cache and
|
|
// FilePrefetchBuffer wants to read that block
|
|
// synchronously.
|
|
// False if current call is to prefetch additional data in
|
|
// extra buffers through ReadAsync API.
|
|
// prev_buf_end_offset : End offset of the previous buffer. It's used in case
|
|
// of ReadAsync to make sure it doesn't read anything from
|
|
// previous buffer which is already prefetched.
|
|
void FilePrefetchBuffer::ReadAheadSizeTuning(
|
|
BufferInfo* buf, bool read_curr_block, bool refit_tail,
|
|
uint64_t prev_buf_end_offset, size_t alignment, size_t length,
|
|
size_t readahead_size, uint64_t& start_offset, uint64_t& end_offset,
|
|
size_t& read_len, uint64_t& aligned_useful_len) {
|
|
uint64_t updated_start_offset = Rounddown(start_offset, alignment);
|
|
uint64_t updated_end_offset =
|
|
Roundup(start_offset + length + readahead_size, alignment);
|
|
uint64_t initial_end_offset = updated_end_offset;
|
|
uint64_t initial_start_offset = updated_start_offset;
|
|
|
|
// Callback to tune the start and end offsets.
|
|
if (readaheadsize_cb_ != nullptr && readahead_size > 0) {
|
|
readaheadsize_cb_(read_curr_block, updated_start_offset,
|
|
updated_end_offset);
|
|
}
|
|
|
|
// read_len will be 0 and there is nothing to read/prefetch.
|
|
if (updated_start_offset == updated_end_offset) {
|
|
start_offset = end_offset = updated_start_offset;
|
|
UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset),
|
|
(updated_end_offset - updated_start_offset));
|
|
return;
|
|
}
|
|
|
|
assert(updated_start_offset < updated_end_offset);
|
|
|
|
if (!read_curr_block) {
|
|
// Handle the case when callback added block handles which are already
|
|
// prefetched and nothing new needs to be prefetched. In that case end
|
|
// offset updated by callback will be less than prev_buf_end_offset which
|
|
// means data has been already prefetched.
|
|
if (updated_end_offset <= prev_buf_end_offset) {
|
|
start_offset = end_offset = prev_buf_end_offset;
|
|
UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset),
|
|
(end_offset - start_offset));
|
|
return;
|
|
}
|
|
}
|
|
|
|
// Realign if start and end offsets are not aligned after tuning.
|
|
start_offset = Rounddown(updated_start_offset, alignment);
|
|
end_offset = Roundup(updated_end_offset, alignment);
|
|
|
|
if (!read_curr_block && start_offset < prev_buf_end_offset) {
|
|
// Previous buffer already contains the data till prev_buf_end_offset
|
|
// because of alignment. Update the start offset after that to avoid
|
|
// prefetching it again.
|
|
start_offset = prev_buf_end_offset;
|
|
}
|
|
|
|
uint64_t roundup_len = end_offset - start_offset;
|
|
|
|
PrepareBufferForRead(buf, alignment, start_offset, roundup_len, refit_tail,
|
|
aligned_useful_len);
|
|
assert(roundup_len >= aligned_useful_len);
|
|
|
|
// Update the buffer offset.
|
|
buf->offset_ = start_offset;
|
|
// Update the initial end offset of this buffer which will be the starting
|
|
// offset of next prefetch.
|
|
buf->initial_end_offset_ = initial_end_offset;
|
|
read_len = static_cast<size_t>(roundup_len - aligned_useful_len);
|
|
|
|
UpdateReadAheadTrimmedStat((initial_end_offset - initial_start_offset),
|
|
(end_offset - start_offset));
|
|
}
|
|
|
|
// If data is overlapping between two buffers then during this call:
|
|
// - data from first buffer is copied into overlapping buffer,
|
|
// - first is removed from bufs_ and freed so that it can be used for async
|
|
// prefetching of further data.
|
|
Status FilePrefetchBuffer::HandleOverlappingData(
|
|
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
|
|
size_t length, size_t readahead_size, bool& copy_to_overlap_buffer,
|
|
uint64_t& tmp_offset, size_t& tmp_length) {
|
|
// No Overlapping of data between 2 buffers.
|
|
if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
|
|
return Status::OK();
|
|
}
|
|
|
|
Status s;
|
|
size_t alignment = reader->file()->GetRequiredBufferAlignment();
|
|
|
|
BufferInfo* buf = GetFirstBuffer();
|
|
|
|
// Check if the first buffer has the required offset and the async read is
|
|
// still in progress. This should only happen if a prefetch was initiated
|
|
// by Seek, but the next access is at another offset.
|
|
if (buf->async_read_in_progress_ &&
|
|
buf->IsOffsetInBufferWithAsyncProgress(offset)) {
|
|
PollIfNeeded(offset, length);
|
|
}
|
|
|
|
if (IsBufferQueueEmpty() || NumBuffersAllocated() == 1) {
|
|
return Status::OK();
|
|
}
|
|
|
|
BufferInfo* next_buf = bufs_[1];
|
|
|
|
// If data is overlapping over two buffers, copy the data from front and
|
|
// call ReadAsync on freed buffer.
|
|
if (!buf->async_read_in_progress_ && buf->DoesBufferContainData() &&
|
|
buf->IsOffsetInBuffer(offset) &&
|
|
(/*Data extends over two buffers and second buffer either has data or in
|
|
process of population=*/
|
|
(offset + length > next_buf->offset_) &&
|
|
(next_buf->async_read_in_progress_ ||
|
|
next_buf->DoesBufferContainData()))) {
|
|
// Allocate new buffer to overlap_buf_.
|
|
overlap_buf_->ClearBuffer();
|
|
overlap_buf_->buffer_.Alignment(alignment);
|
|
overlap_buf_->buffer_.AllocateNewBuffer(length);
|
|
overlap_buf_->offset_ = offset;
|
|
copy_to_overlap_buffer = true;
|
|
|
|
CopyDataToBuffer(buf, tmp_offset, tmp_length);
|
|
UpdateStats(/*found_in_buffer=*/false, overlap_buf_->CurrentSize());
|
|
|
|
// Call async prefetching on freed buffer since data has been consumed
|
|
// only if requested data lies within next buffer.
|
|
size_t second_size = next_buf->async_read_in_progress_
|
|
? next_buf->async_req_len_
|
|
: next_buf->CurrentSize();
|
|
uint64_t start_offset = next_buf->initial_end_offset_;
|
|
|
|
// If requested bytes - tmp_offset + tmp_length are in next buffer, freed
|
|
// buffer can go for further prefetching.
|
|
// If requested bytes are not in next buffer, next buffer has to go for sync
|
|
// call to get remaining requested bytes. In that case it shouldn't go for
|
|
// async prefetching as async prefetching calculates offset based on
|
|
// previous buffer end offset and previous buffer has to go for sync
|
|
// prefetching.
|
|
|
|
if (tmp_offset + tmp_length <= next_buf->offset_ + second_size) {
|
|
AllocateBuffer();
|
|
BufferInfo* new_buf = GetLastBuffer();
|
|
size_t read_len = 0;
|
|
uint64_t end_offset = start_offset, aligned_useful_len = 0;
|
|
|
|
ReadAheadSizeTuning(new_buf, /*read_curr_block=*/false,
|
|
/*refit_tail=*/false, next_buf->offset_ + second_size,
|
|
alignment,
|
|
/*length=*/0, readahead_size, start_offset,
|
|
end_offset, read_len, aligned_useful_len);
|
|
if (read_len > 0) {
|
|
s = ReadAsync(new_buf, opts, reader, read_len, start_offset);
|
|
if (!s.ok()) {
|
|
DestroyAndClearIOHandle(new_buf);
|
|
FreeLastBuffer();
|
|
return s;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return s;
|
|
}
|
|
|
|
// When data is outdated, we clear the first buffer and free it as the
|
|
// data has been consumed because of sequential reads.
|
|
//
|
|
// Scenarios for prefetching asynchronously:
|
|
// Case1: If all buffers are in free_bufs_, prefetch n + readahead_size_/2 bytes
|
|
// synchronously in first buffer and prefetch readahead_size_/2 async in
|
|
// remaining buffers (num_buffers_ -1 ).
|
|
// Case2: If first buffer has partial data, prefetch readahead_size_/2 async in
|
|
// remaining buffers. In case of partial data, prefetch remaining bytes
|
|
// from size n synchronously to fulfill the requested bytes request.
|
|
// Case5: (Special case) If data is overlapping in two buffers, copy requested
|
|
// data from first, free that buffer to send for async request, wait for
|
|
// poll to fill next buffer (if any), and copy remaining data from that
|
|
// buffer to overlap buffer.
|
|
Status FilePrefetchBuffer::PrefetchInternal(const IOOptions& opts,
|
|
RandomAccessFileReader* reader,
|
|
uint64_t offset, size_t length,
|
|
size_t readahead_size,
|
|
bool& copy_to_overlap_buffer) {
|
|
if (!enable_) {
|
|
return Status::OK();
|
|
}
|
|
|
|
TEST_SYNC_POINT("FilePrefetchBuffer::Prefetch:Start");
|
|
|
|
size_t alignment = reader->file()->GetRequiredBufferAlignment();
|
|
Status s;
|
|
uint64_t tmp_offset = offset;
|
|
size_t tmp_length = length;
|
|
size_t original_length = length;
|
|
|
|
// Abort outdated IO.
|
|
if (!explicit_prefetch_submitted_) {
|
|
AbortOutdatedIO(offset);
|
|
FreeEmptyBuffers();
|
|
}
|
|
ClearOutdatedData(offset, length);
|
|
|
|
// Handle overlapping data over two buffers.
|
|
s = HandleOverlappingData(opts, reader, offset, length, readahead_size,
|
|
copy_to_overlap_buffer, tmp_offset, tmp_length);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
|
|
AllocateBufferIfEmpty();
|
|
BufferInfo* buf = GetFirstBuffer();
|
|
|
|
// Call Poll only if data is needed for the second buffer.
|
|
// - Return if whole data is in first and second buffer is in progress or
|
|
// already full.
|
|
// - If second buffer is empty, it will go for ReadAsync for second buffer.
|
|
if (!buf->async_read_in_progress_ && buf->DoesBufferContainData() &&
|
|
buf->IsDataBlockInBuffer(offset, length)) {
|
|
// Whole data is in buffer.
|
|
if (!IsEligibleForFurtherPrefetching()) {
|
|
UpdateStats(/*found_in_buffer=*/true, original_length);
|
|
return s;
|
|
}
|
|
} else {
|
|
PollIfNeeded(tmp_offset, tmp_length);
|
|
}
|
|
|
|
AllocateBufferIfEmpty();
|
|
buf = GetFirstBuffer();
|
|
offset = tmp_offset;
|
|
length = tmp_length;
|
|
|
|
// After polling, if all the requested bytes are in first buffer, it will only
|
|
// go for async prefetching.
|
|
if (buf->DoesBufferContainData()) {
|
|
if (copy_to_overlap_buffer) {
|
|
// Data is overlapping i.e. some of the data has been copied to overlap
|
|
// buffer and remaining will be updated below.
|
|
size_t initial_buf_size = overlap_buf_->CurrentSize();
|
|
CopyDataToBuffer(buf, offset, length);
|
|
UpdateStats(
|
|
/*found_in_buffer=*/false,
|
|
overlap_buf_->CurrentSize() - initial_buf_size);
|
|
|
|
// Length == 0: All the requested data has been copied to overlap buffer
|
|
// and it has already gone for async prefetching. It can return without
|
|
// doing anything further.
|
|
// Length > 0: More data needs to be consumed so it will continue async
|
|
// and sync prefetching and copy the remaining data to overlap buffer in
|
|
// the end.
|
|
if (length == 0) {
|
|
UpdateStats(/*found_in_buffer=*/true, length);
|
|
return s;
|
|
}
|
|
} else {
|
|
if (buf->IsDataBlockInBuffer(offset, length)) {
|
|
offset += length;
|
|
length = 0;
|
|
// Since async request was submitted directly by calling PrefetchAsync
|
|
// in last call, we don't need to prefetch further as this call is to
|
|
// poll the data submitted in previous call.
|
|
if (explicit_prefetch_submitted_) {
|
|
return s;
|
|
}
|
|
if (!IsEligibleForFurtherPrefetching()) {
|
|
UpdateStats(/*found_in_buffer=*/true, original_length);
|
|
return s;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
AllocateBufferIfEmpty();
|
|
buf = GetFirstBuffer();
|
|
|
|
assert(!buf->async_read_in_progress_);
|
|
|
|
// Go for ReadAsync and Read (if needed).
|
|
// offset and size alignment for first buffer with synchronous prefetching
|
|
uint64_t start_offset1 = offset, end_offset1 = 0, aligned_useful_len1 = 0;
|
|
size_t read_len1 = 0;
|
|
|
|
// For length == 0, skip the synchronous prefetching. read_len1 will be 0.
|
|
if (length > 0) {
|
|
if (buf->IsOffsetInBuffer(offset)) {
|
|
UpdateStats(/*found_in_buffer=*/false,
|
|
(buf->offset_ + buf->CurrentSize() - offset));
|
|
}
|
|
ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail*/
|
|
true, start_offset1, alignment, length, readahead_size,
|
|
start_offset1, end_offset1, read_len1,
|
|
aligned_useful_len1);
|
|
} else {
|
|
UpdateStats(/*found_in_buffer=*/true, original_length);
|
|
}
|
|
|
|
// Prefetch in remaining buffer only if readahead_size > 0.
|
|
if (readahead_size > 0) {
|
|
s = PrefetchRemBuffers(opts, reader, end_offset1, alignment,
|
|
readahead_size);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
}
|
|
|
|
if (read_len1 > 0) {
|
|
s = Read(buf, opts, reader, read_len1, aligned_useful_len1, start_offset1);
|
|
if (!s.ok()) {
|
|
AbortAllIOs();
|
|
FreeAllBuffers();
|
|
return s;
|
|
}
|
|
}
|
|
|
|
// Copy remaining requested bytes to overlap_buffer. No need to update stats
|
|
// as data is prefetched during this call.
|
|
if (copy_to_overlap_buffer && length > 0) {
|
|
CopyDataToBuffer(buf, offset, length);
|
|
}
|
|
return s;
|
|
}
|
|
|
|
bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts,
|
|
RandomAccessFileReader* reader,
|
|
uint64_t offset, size_t n,
|
|
Slice* result, Status* status,
|
|
bool for_compaction) {
|
|
bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status,
|
|
for_compaction);
|
|
if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) {
|
|
if (ret) {
|
|
RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT);
|
|
} else {
|
|
RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS);
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
bool FilePrefetchBuffer::TryReadFromCacheUntracked(
|
|
const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
|
|
size_t n, Slice* result, Status* status, bool for_compaction) {
|
|
if (track_min_offset_ && offset < min_offset_read_) {
|
|
min_offset_read_ = static_cast<size_t>(offset);
|
|
}
|
|
|
|
if (!enable_) {
|
|
return false;
|
|
}
|
|
|
|
if (explicit_prefetch_submitted_) {
|
|
// explicit_prefetch_submitted_ is special case where it expects request
|
|
// submitted in PrefetchAsync should match with this request. Otherwise
|
|
// buffers will be outdated.
|
|
// Random offset called. So abort the IOs.
|
|
if (prev_offset_ != offset) {
|
|
AbortAllIOs();
|
|
FreeAllBuffers();
|
|
explicit_prefetch_submitted_ = false;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
AllocateBufferIfEmpty();
|
|
BufferInfo* buf = GetFirstBuffer();
|
|
|
|
if (!explicit_prefetch_submitted_ && offset < buf->offset_) {
|
|
return false;
|
|
}
|
|
|
|
bool prefetched = false;
|
|
bool copy_to_overlap_buffer = false;
|
|
// If the buffer contains only a few of the requested bytes:
|
|
// If readahead is enabled: prefetch the remaining bytes + readahead
|
|
// bytes
|
|
// and satisfy the request.
|
|
// If readahead is not enabled: return false.
|
|
TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
|
|
&readahead_size_);
|
|
|
|
if (explicit_prefetch_submitted_ ||
|
|
(buf->async_read_in_progress_ ||
|
|
offset + n > buf->offset_ + buf->CurrentSize())) {
|
|
// In case readahead_size is trimmed (=0), we still want to poll the data
|
|
// submitted with explicit_prefetch_submitted_=true.
|
|
if (readahead_size_ > 0 || explicit_prefetch_submitted_) {
|
|
Status s;
|
|
assert(reader != nullptr);
|
|
assert(max_readahead_size_ >= readahead_size_);
|
|
|
|
if (for_compaction) {
|
|
s = Prefetch(opts, reader, offset, std::max(n, readahead_size_));
|
|
} else {
|
|
if (implicit_auto_readahead_) {
|
|
if (!IsEligibleForPrefetch(offset, n)) {
|
|
// Ignore status as Prefetch is not called.
|
|
s.PermitUncheckedError();
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Prefetch n + readahead_size_/2 synchronously as remaining
|
|
// readahead_size_/2 will be prefetched asynchronously if num_buffers_
|
|
// > 1.
|
|
s = PrefetchInternal(
|
|
opts, reader, offset, n,
|
|
(num_buffers_ > 1 ? readahead_size_ / 2 : readahead_size_),
|
|
copy_to_overlap_buffer);
|
|
explicit_prefetch_submitted_ = false;
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
if (status) {
|
|
*status = s;
|
|
}
|
|
#ifndef NDEBUG
|
|
IGNORE_STATUS_IF_ERROR(s);
|
|
#endif
|
|
return false;
|
|
}
|
|
prefetched = explicit_prefetch_submitted_ ? false : true;
|
|
} else {
|
|
return false;
|
|
}
|
|
} else if (!for_compaction) {
|
|
UpdateStats(/*found_in_buffer=*/true, n);
|
|
}
|
|
|
|
UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
|
|
|
|
buf = GetFirstBuffer();
|
|
if (copy_to_overlap_buffer) {
|
|
buf = overlap_buf_;
|
|
}
|
|
uint64_t offset_in_buffer = offset - buf->offset_;
|
|
*result = Slice(buf->buffer_.BufferStart() + offset_in_buffer, n);
|
|
if (prefetched) {
|
|
readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Completion callback for async reads submitted by ReadAsync(). `cb_arg` is
// the BufferInfo the request was issued for. On a successful read the buffer
// size is grown by the number of bytes returned, unless the result is already
// covered by the buffer or the buffer has since moved past the request.
void FilePrefetchBuffer::PrefetchAsyncCallback(FSReadRequest& req,
                                               void* cb_arg) {
  BufferInfo* target = static_cast<BufferInfo*>(cb_arg);

#ifndef NDEBUG
  if (req.result.size() < req.len) {
    // Fake an IO error to force db_stress fault injection to ignore
    // truncated read errors
    IGNORE_STATUS_IF_ERROR(Status::IOError());
  }
  IGNORE_STATUS_IF_ERROR(req.status);
#endif

  if (!req.status.ok()) {
    return;
  }

  const uint64_t buffered_end = target->offset_ + target->CurrentSize();
  if (req.offset + req.result.size() <= buffered_end) {
    // Either the bytes are already in the buffer or nothing was read because
    // of EOF; there is nothing to update.
    return;
  }

  if (req.offset < target->offset_) {
    // The next block to read has changed (the latest read was not
    // sequential), so this result is ignored.
    return;
  }

  const size_t size_before = target->CurrentSize();
  target->buffer_.Size(size_before + req.result.size());
}
|
|
|
|
// Explicitly submits an asynchronous prefetch for [offset, offset + n).
//
// Returns:
//   NotSupported - the prefetch buffer is disabled.
//   OK           - the requested data was already fully in the first buffer;
//                  `*result` points at it.
//   TryAgain     - the data was not available; if a read was needed it has
//                  been submitted asynchronously.
//   other error  - submitting an async read failed.
//
// All pending async reads are aborted first, so buffers cannot be out of sync
// with the new request.
Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
                                         RandomAccessFileReader* reader,
                                         uint64_t offset, size_t n,
                                         Slice* result) {
  assert(reader != nullptr);
  if (!enable_) {
    return Status::NotSupported();
  }

  TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");

  num_file_reads_ = 0;
  explicit_prefetch_submitted_ = false;
  bool is_eligible_for_prefetching = false;

  if (readahead_size_ > 0 &&
      (!implicit_auto_readahead_ ||
       num_file_reads_ >= num_file_reads_for_auto_readahead_)) {
    is_eligible_for_prefetching = true;
  }

  // Cancel any pending async read to make code simpler as buffers can be out
  // of sync.
  AbortAllIOs();
  // Free empty buffers after aborting IOs.
  FreeEmptyBuffers();
  ClearOutdatedData(offset, n);

  // - PrefetchAsync can be called on non sequential reads, so offset can be
  //   less than the first buffer's offset. In that case all buffers are
  //   cleared.
  // - In case of tuning of readahead_size, on Reseek, all buffers have to be
  //   cleared; otherwise we may end up with inconsistent BlockHandles in queue
  //   and data in buffer.
  if (!IsBufferQueueEmpty()) {
    BufferInfo* buf = GetFirstBuffer();
    if (readaheadsize_cb_ != nullptr || !buf->IsOffsetInBuffer(offset)) {
      FreeAllBuffers();
    }
  }

  UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);

  bool data_found = false;

  // If first buffer has full data.
  if (!IsBufferQueueEmpty()) {
    BufferInfo* buf = GetFirstBuffer();
    if (buf->DoesBufferContainData() && buf->IsDataBlockInBuffer(offset, n)) {
      uint64_t offset_in_buffer = offset - buf->offset_;
      *result = Slice(buf->buffer_.BufferStart() + offset_in_buffer, n);
      data_found = true;
      UpdateStats(/*found_in_buffer=*/true, n);

      // Update num_file_reads_ as TryReadFromCacheAsync won't be called for
      // poll and update num_file_reads_ if data is found.
      num_file_reads_++;

      // If next buffer contains some data or is not eligible for prefetching,
      // return.
      if (!is_eligible_for_prefetching || NumBuffersAllocated() > 1) {
        return Status::OK();
      }
    } else {
      // Partial data in first buffer. Clear it to return continuous data in
      // one buffer.
      FreeAllBuffers();
    }
  }

  size_t alignment = reader->file()->GetRequiredBufferAlignment();
  size_t readahead_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0;
  uint64_t start_offset1 = offset, end_offset1 = 0, aligned_useful_len1 = 0;
  size_t read_len1 = 0;

  AllocateBufferIfEmpty();
  BufferInfo* buf = GetFirstBuffer();

  // - If first buffer is empty.
  //   - Call async read for full data + readahead_size on first buffer.
  //   - Call async read for readahead_size on all remaining buffers if
  //     eligible.
  // - If first buffer contains data,
  //   - Call async read for readahead_size on all remaining buffers if
  //     eligible.

  // Calculate length and offsets for reading.
  if (!buf->DoesBufferContainData()) {
    // Prefetch full data + readahead_size in the first buffer.
    if (is_eligible_for_prefetching || reader->use_direct_io()) {
      ReadAheadSizeTuning(buf, /*read_curr_block=*/true, /*refit_tail=*/false,
                          /*prev_buf_end_offset=*/start_offset1, alignment, n,
                          readahead_size, start_offset1, end_offset1, read_len1,
                          aligned_useful_len1);
    } else {
      // No alignment or extra prefetching: read exactly [offset, offset + n).
      // The offsets stay in 64-bit arithmetic so that large file offsets are
      // not truncated where size_t is narrower than uint64_t.
      start_offset1 = offset;
      end_offset1 = offset + n;
      PrepareBufferForRead(buf, alignment, start_offset1, /*roundup_len=*/n,
                           /*refit_tail=*/false, aligned_useful_len1);
      // The buffer is empty, so nothing can have been preserved.
      assert(aligned_useful_len1 == 0);
      read_len1 = n;
      buf->offset_ = start_offset1;
    }

    if (read_len1 > 0) {
      Status s = ReadAsync(buf, opts, reader, read_len1, start_offset1);
      if (!s.ok()) {
        DestroyAndClearIOHandle(buf);
        FreeLastBuffer();
        return s;
      }
      explicit_prefetch_submitted_ = true;
      prev_len_ = 0;
    }
  }

  if (is_eligible_for_prefetching) {
    Status s = PrefetchRemBuffers(opts, reader, end_offset1, alignment,
                                  readahead_size);
    if (!s.ok()) {
      return s;
    }
    readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
  }
  return (data_found ? Status::OK() : Status::TryAgain());
}
|
|
|
|
// Allocates buffers until num_buffers_ are in use and submits an async read of
// up to `readahead_size` bytes for each newly allocated one. Every new buffer
// starts from the initial end offset recorded on the buffer before it.
//
// end_offset1 - end offset of the data already covered by the preceding
//               buffer; passed to ReadAheadSizeTuning() so nothing is
//               prefetched twice.
//
// Returns the first ReadAsync failure (after releasing the buffer allocated
// for that request), otherwise OK.
Status FilePrefetchBuffer::PrefetchRemBuffers(const IOOptions& opts,
                                              RandomAccessFileReader* reader,
                                              uint64_t end_offset1,
                                              size_t alignment,
                                              size_t readahead_size) {
  Status s;
  while (NumBuffersAllocated() < num_buffers_) {
    // The next prefetch begins where the last buffer's untuned range ended.
    uint64_t next_start = GetLastBuffer()->initial_end_offset_;

    AllocateBuffer();
    BufferInfo* fresh_buf = GetLastBuffer();

    uint64_t next_end = next_start;
    uint64_t useful_len = 0;
    size_t bytes_to_read = 0;
    ReadAheadSizeTuning(fresh_buf, /*read_curr_block=*/false,
                        /*refit_tail=*/false,
                        /*prev_buf_end_offset=*/end_offset1, alignment,
                        /*length=*/0, readahead_size, next_start, next_end,
                        bytes_to_read, useful_len);

    if (bytes_to_read > 0) {
      TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching");
      s = ReadAsync(fresh_buf, opts, reader, bytes_to_read, next_start);
      if (!s.ok()) {
        DestroyAndClearIOHandle(fresh_buf);
        FreeLastBuffer();
        return s;
      }
    }
    end_offset1 = next_end;
  }
  return s;
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|