rocksdb/table/block_based/block_prefetcher.cc
Hui Xiao 719f5511f6 No file system prefetching when Options::compaction_readahead_size is 0 (#11887)
Summary:
**Context/Summary:**

https://github.com/facebook/rocksdb/pull/11631 introduced the `readahead()` system call for compaction reads under non-direct IO. When `Options::compaction_readahead_size` is 0, the `readahead()` will be issued with a small size (i.e., the block size, by default 4KB).

Benchmarks show that such a `readahead()` call regresses compaction read performance compared with the "no readahead()" case (see Test Plan for more).

Therefore we decided not to issue such a `readahead()` call when `Options::compaction_readahead_size` is 0.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11887

Test Plan:
Settings: `compaction_readahead_size = 0, use_direct_reads=false`
Setup:
```
TEST_TMPDIR=../ ./db_bench -benchmarks=filluniquerandom -disable_auto_compactions=true -write_buffer_size=1048576 -compression_type=none -value_size=10240 && tar -cf ../dbbench.tar -C ../dbbench/ .
```
Run:
```
for i in $(seq 3); do rm -rf ../dbbench/ && mkdir -p ../dbbench/ && tar -xf ../dbbench.tar -C ../dbbench/ . && sudo bash -c 'sync && echo 3 > /proc/sys/vm/drop_caches' && TEST_TMPDIR=../ /usr/bin/time ./db_bench_{pre_PR11631|PR11631|PR11631_with_improvementPR11887} -benchmarks=compact -use_existing_db=true -db=../dbbench/ -disable_auto_compactions=true -compression_type=none ; done |& grep elapsed
```

pre-PR11631("no readahead()" case):

PR11631:

PR11631+this improvement:

Reviewed By: ajkr

Differential Revision: D49607266

Pulled By: hx235

fbshipit-source-id: 2efa0dc91bac3c11cc2be057c53d894645f683ef
2023-09-26 10:08:43 -07:00

150 lines
5.7 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based/block_prefetcher.h"
#include "rocksdb/file_system.h"
#include "table/block_based/block_based_table_reader.h"
namespace ROCKSDB_NAMESPACE {
// Decides whether, and how, to read ahead for the block described by `handle`.
//
// Three mutually exclusive modes are handled, in this order:
//   1. Compaction reads (`is_for_compaction`): try a file-system level
//      Prefetch when not using direct IO and compaction_readahead_size_ is
//      non-zero; otherwise (or if that Prefetch does not succeed) make sure an
//      internal FilePrefetchBuffer exists.
//   2. Explicit readahead (`readahead_size > 0`): make sure an internal
//      FilePrefetchBuffer of the requested size exists.
//   3. Implicit auto readahead: once enough sequential reads have been seen,
//      either create an internal FilePrefetchBuffer or issue a file-system
//      Prefetch whose size doubles on each call, capped by
//      table_options.max_auto_readahead_size.
//
// `no_sequential_checking` skips the sequential-read bookkeeping of mode 3 and
// creates the buffer right away. `readaheadsize_cb` is forwarded unchanged to
// the buffer created for modes 2 and 3.
void BlockPrefetcher::PrefetchIfNeeded(
    const BlockBasedTable::Rep* rep, const BlockHandle& handle,
    const size_t readahead_size, bool is_for_compaction,
    const bool no_sequential_checking, const ReadOptions& read_options,
    const std::function<void(uint64_t, size_t, size_t&)>& readaheadsize_cb) {
  const size_t block_len = BlockBasedTable::BlockSizeWithTrailer(handle);
  const size_t block_offset = handle.offset();
  // One past the last byte of the block (trailer included).
  const size_t block_end = block_offset + block_len;

  // ---- Mode 1: compaction reads -------------------------------------------
  if (is_for_compaction) {
    if (!rep->file->use_direct_io() && compaction_readahead_size_ > 0) {
      // readahead_limit_ is only advanced after a successful FS prefetch, so
      // a block ending at or before it has already been requested.
      if (block_end <= readahead_limit_) {
        return;
      }

      IOOptions io_opts;
      Status io_status = rep->file->PrepareIOOptions(read_options, io_opts);
      if (!io_status.ok()) {
        return;
      }

      io_status = rep->file->Prefetch(io_opts, block_offset,
                                      block_len + compaction_readahead_size_);
      if (io_status.ok()) {
        readahead_limit_ = block_end + compaction_readahead_size_;
        return;
      }
      // Any non-OK status falls through to the internal buffer below; the
      // status itself is deliberately dropped because a failed prefetch only
      // means the data is read from disk later.
    }

    // Internal prefetch buffer fallback. num_file_reads only matters to
    // FilePrefetchBuffer when implicit_auto_readahead is set, hence the zeros.
    rep->CreateFilePrefetchBufferIfNotExists(
        compaction_readahead_size_, compaction_readahead_size_,
        &prefetch_buffer_, /*implicit_auto_readahead=*/false,
        /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0,
        /*upper_bound_offset=*/0, /*readaheadsize_cb=*/nullptr);
    return;
  }

  // ---- Mode 2: readahead explicitly requested by the user ------------------
  if (readahead_size != 0) {
    rep->CreateFilePrefetchBufferIfNotExists(
        readahead_size, readahead_size, &prefetch_buffer_,
        /*implicit_auto_readahead=*/false, /*num_file_reads=*/0,
        /*num_file_reads_for_auto_readahead=*/0, upper_bound_offset_,
        readaheadsize_cb);
    return;
  }

  // ---- Mode 3: implicit auto readahead -------------------------------------
  const size_t max_readahead = rep->table_options.max_auto_readahead_size;

  // A zero maximum or a zero initial size disables implicit readahead.
  if (max_readahead == 0 || initial_auto_readahead_size_ == 0) {
    return;
  }

  // The initial size may never exceed the configured maximum.
  if (initial_auto_readahead_size_ > max_readahead) {
    initial_auto_readahead_size_ = max_readahead;
  }

  // Caller opted out of sequential detection: create the buffer immediately
  // without touching num_file_reads_.
  if (no_sequential_checking) {
    rep->CreateFilePrefetchBufferIfNotExists(
        initial_auto_readahead_size_, max_readahead, &prefetch_buffer_,
        /*implicit_auto_readahead=*/true, /*num_file_reads=*/0,
        rep->table_options.num_file_reads_for_auto_readahead,
        upper_bound_offset_, readaheadsize_cb);
    return;
  }

  // Classify this read *before* recording it: IsBlockSequential() must see
  // the state left by the previous read, and it is not consulted at all when
  // the block is already covered by an earlier FS prefetch.
  const bool already_covered = block_end <= readahead_limit_;
  const bool breaks_sequence =
      !already_covered && !IsBlockSequential(block_offset);

  UpdateReadPattern(block_offset, block_len);

  if (already_covered) {
    return;
  }
  if (breaks_sequence) {
    // Non-sequential access: start over from the configured initial size.
    ResetValues(rep->table_options.initial_auto_readahead_size);
    return;
  }

  // Readahead only kicks in after more than
  // table_options.num_file_reads_for_auto_readahead sequential reads.
  ++num_file_reads_;
  if (num_file_reads_ <= rep->table_options.num_file_reads_for_auto_readahead) {
    return;
  }

  // Direct IO always uses the internal buffer; otherwise try the file system
  // first and use the internal buffer only if Prefetch is not supported.
  bool use_internal_buffer = rep->file->use_direct_io();
  if (!use_internal_buffer) {
    if (readahead_size_ > max_readahead) {
      readahead_size_ = max_readahead;
    }

    IOOptions io_opts;
    Status io_status = rep->file->PrepareIOOptions(read_options, io_opts);
    if (!io_status.ok()) {
      return;
    }

    // Statuses other than NotSupported are intentionally ignored: if the
    // prefetch failed, the block is simply read from disk.
    io_status = rep->file->Prefetch(
        io_opts, handle.offset(),
        BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_);
    use_internal_buffer = io_status.IsNotSupported();
  }

  if (use_internal_buffer) {
    rep->CreateFilePrefetchBufferIfNotExists(
        initial_auto_readahead_size_, max_readahead, &prefetch_buffer_,
        /*implicit_auto_readahead=*/true, num_file_reads_,
        rep->table_options.num_file_reads_for_auto_readahead,
        upper_bound_offset_, readaheadsize_cb);
    return;
  }

  // FS prefetch was issued: remember how far it reaches, then double the
  // size for the next round, capped at the configured maximum.
  readahead_limit_ = block_end + readahead_size_;
  readahead_size_ = std::min(max_readahead, readahead_size_ * 2);
}
} // namespace ROCKSDB_NAMESPACE