// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#pragma once

#include <algorithm>
#include <atomic>
#include <deque>
#include <sstream>
#include <string>

#include "file/readahead_file_info.h"
#include "monitoring/statistics_impl.h"
#include "port/port.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/options.h"
#include "util/aligned_buffer.h"
#include "util/autovector.h"
#include "util/stop_watch.h"

namespace ROCKSDB_NAMESPACE {

#define DEFAULT_DECREMENT 8 * 1024

struct IOOptions;
class RandomAccessFileReader;

struct ReadaheadParams {
  ReadaheadParams() {}

  // The initial readahead size.
  size_t initial_readahead_size = 0;

  // The maximum readahead size.
  // If max_readahead_size > initial_readahead_size, the readahead size will be
  // doubled on every IO until max_readahead_size is hit. Typically this is set
  // as a multiple of initial_readahead_size, and max_readahead_size should be
  // greater than or equal to initial_readahead_size.
  size_t max_readahead_size = 0;

  // If true, readahead is enabled implicitly by RocksDB after it observes
  // num_file_reads_for_auto_readahead sequential reads.
  bool implicit_auto_readahead = false;

  // TODO akanksha - Remove num_file_reads when BlockPrefetcher is refactored.
  uint64_t num_file_reads = 0;
  uint64_t num_file_reads_for_auto_readahead = 0;

  // Number of buffers to maintain that contain prefetched data. If
  // num_buffers > 1, buffers are filled asynchronously whenever they are
  // emptied.
  size_t num_buffers = 1;
};
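
// A minimal sketch of the doubling rule described above (hypothetical values,
// not RocksDB defaults):
//
//   size_t readahead_size = params.initial_readahead_size;  // e.g. 8 KB
//   // On each sequential prefetch:
//   readahead_size =
//       std::min(readahead_size * 2, params.max_readahead_size);
//   // With max_readahead_size = 64 KB this yields 16, 32, 64 KB, then stays
//   // at 64 KB for subsequent IOs.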

struct BufferInfo {
  void ClearBuffer() {
    buffer_.Clear();
    initial_end_offset_ = 0;
    async_req_len_ = 0;
  }

  AlignedBuffer buffer_;

  uint64_t offset_ = 0;

  // The parameters below are used in the async read flow.
  // Length requested in ReadAsync.
  size_t async_req_len_ = 0;

  // async_read_in_progress_ can be used as a mutex. The callback can update
  // the buffer and its size, but async_read_in_progress_ is only set by the
  // main thread.
  bool async_read_in_progress_ = false;

  // io_handle_ is allocated and used by the underlying file system in case of
  // asynchronous reads.
  void* io_handle_ = nullptr;

  IOHandleDeleter del_fn_ = nullptr;

  // initial_end_offset_ keeps track of the end offset of the buffer as it was
  // originally requested. It's helpful in case of auto-tuning of readahead
  // size when the callback is made to BlockBasedTableIterator. It is the
  // initial end offset of this buffer, which becomes the starting offset of
  // the next prefetch.
  //
  // For example - if the end offset of the previous buffer was 100 and,
  // because of the readahead_size optimization, end_offset was trimmed to 60,
  // then for the next prefetch call start_offset should be initialized to 100,
  // i.e. start_offset = buf->initial_end_offset_.
  uint64_t initial_end_offset_ = 0;

  bool IsDataBlockInBuffer(uint64_t offset, size_t length) {
    assert(async_read_in_progress_ == false);
    return (offset >= offset_ &&
            offset + length <= offset_ + buffer_.CurrentSize());
  }

  bool IsOffsetInBuffer(uint64_t offset) {
    assert(async_read_in_progress_ == false);
    return (offset >= offset_ && offset < offset_ + buffer_.CurrentSize());
  }

  bool DoesBufferContainData() {
    assert(async_read_in_progress_ == false);
    return buffer_.CurrentSize() > 0;
  }

  bool IsBufferOutdated(uint64_t offset) {
    return (!async_read_in_progress_ && DoesBufferContainData() &&
            offset >= offset_ + buffer_.CurrentSize());
  }

  bool IsBufferOutdatedWithAsyncProgress(uint64_t offset) {
    return (async_read_in_progress_ && io_handle_ != nullptr &&
            offset >= offset_ + async_req_len_);
  }

  bool IsOffsetInBufferWithAsyncProgress(uint64_t offset) {
    return (async_read_in_progress_ && offset >= offset_ &&
            offset < offset_ + async_req_len_);
  }

  size_t CurrentSize() { return buffer_.CurrentSize(); }
};

enum class FilePrefetchBufferUsage {
  kTableOpenPrefetchTail,
  kUserScanPrefetch,
  kUnknown,
};

// Implementation:
// FilePrefetchBuffer maintains a deque of free buffers (free_bufs_) with no
// data and bufs_ which contains the prefetched data. Whenever a buffer is
// consumed or is outdated (w.r.t. the requested offset), that buffer is
// cleared and returned to free_bufs_.
//
// If a buffer is available in free_bufs_, it's moved to bufs_ and is sent for
// prefetching.
// num_buffers_ defines how many buffers FilePrefetchBuffer can maintain at a
// time that contain prefetched data, with num_buffers_ == bufs_.size() +
// free_bufs_.size().
//
// If num_buffers_ == 1, it's a sequential read flow. The Read API will be
// called on that one buffer whenever the data is requested and is not in the
// buffer.
// If num_buffers_ > 1, then the data is prefetched asynchronously into the
// buffers whenever the data is consumed from the buffers and a buffer is
// freed.
// If num_buffers_ > 1, then the requested data can overlap between 2 buffers.
// To return a contiguous buffer, overlap_buf_ is used. The requested data is
// copied from the 2 buffers into overlap_buf_, and overlap_buf_ is returned to
// the caller.
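
// A minimal sketch of the buffer lifecycle described above (pseudo-code only;
// member names are from this header, the steps are a paraphrase):
//
//   // On a read at `offset`:
//   //   1. Return consumed/outdated BufferInfo objects from bufs_ to
//   //      free_bufs_ after clearing them.
//   //   2. If the data is not (fully) in bufs_, take a BufferInfo from
//   //      free_bufs_, move it to bufs_, and issue a (possibly async) read.
//   //   3. If the request spans two buffers, copy both pieces into
//   //      overlap_buf_ and hand overlap_buf_ back to the caller.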

// FilePrefetchBuffer is a smart buffer to store and read data from a file.
class FilePrefetchBuffer {
 public:
  // Constructor.
  //
  // All arguments are optional.
  // ReadaheadParams : Parameters to control the readahead behavior.
  // enable : controls whether reading from the buffer is enabled.
  //   If false, TryReadFromCache() always returns false, and we
  //   only take stats for the minimum offset if
  //   track_min_offset = true.
  //   See below NOTE about mmap reads.
  // track_min_offset : Track the minimum offset ever read and collect stats on
  //   it. Used for adaptable readahead of the file
  //   footer/metadata.
  //
  // A user can construct a FilePrefetchBuffer without any arguments, but use
  // `Prefetch` to load data into the buffer.
  // NOTE: FilePrefetchBuffer is incompatible with prefetching from
  // RandomAccessFileReaders using mmap reads, so it is common to use
  // `!use_mmap_reads` for the `enable` parameter.
  FilePrefetchBuffer(
      const ReadaheadParams& readahead_params = {}, bool enable = true,
      bool track_min_offset = false, FileSystem* fs = nullptr,
      SystemClock* clock = nullptr, Statistics* stats = nullptr,
      const std::function<void(bool, uint64_t&, uint64_t&)>& cb = nullptr,
      FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown)
      : readahead_size_(readahead_params.initial_readahead_size),
        initial_auto_readahead_size_(readahead_params.initial_readahead_size),
        max_readahead_size_(readahead_params.max_readahead_size),
        min_offset_read_(std::numeric_limits<size_t>::max()),
        enable_(enable),
        track_min_offset_(track_min_offset),
        implicit_auto_readahead_(readahead_params.implicit_auto_readahead),
        prev_offset_(0),
        prev_len_(0),
        num_file_reads_for_auto_readahead_(
            readahead_params.num_file_reads_for_auto_readahead),
        num_file_reads_(readahead_params.num_file_reads),
        explicit_prefetch_submitted_(false),
        fs_(fs),
        clock_(clock),
        stats_(stats),
        usage_(usage),
        readaheadsize_cb_(cb),
        num_buffers_(readahead_params.num_buffers) {
    assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) ||
           (num_file_reads_ == 0));

    // If num_buffers_ > 1, data is asynchronously filled into the queue. As a
    // result, data can overlap between two buffers. The data is copied to
    // overlap_buf_ in order to return a contiguous buffer.
    if (num_buffers_ > 1) {
      overlap_buf_ = new BufferInfo();
    }

    free_bufs_.resize(num_buffers_);
    for (uint32_t i = 0; i < num_buffers_; i++) {
      free_bufs_[i] = new BufferInfo();
    }
  }
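
  // Example construction (an illustrative sketch; the option values and the
  // `use_mmap_reads` flag below are hypothetical, not defaults):
  //
  //   ReadaheadParams params;
  //   params.initial_readahead_size = 8 * 1024;
  //   params.max_readahead_size = 64 * 1024;
  //   params.implicit_auto_readahead = true;
  //   params.num_file_reads_for_auto_readahead = 2;
  //   FilePrefetchBuffer prefetch_buffer(params, /*enable=*/!use_mmap_reads);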

  ~FilePrefetchBuffer() {
    // Abort any pending async read request before destroying the class object.
    if (fs_ != nullptr) {
      std::vector<void*> handles;
      for (auto& buf : bufs_) {
        if (buf->async_read_in_progress_ && buf->io_handle_ != nullptr) {
          handles.emplace_back(buf->io_handle_);
        }
      }
      if (!handles.empty()) {
        StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
        Status s = fs_->AbortIO(handles);
        assert(s.ok());
      }

      for (auto& buf : bufs_) {
        if (buf->io_handle_ != nullptr) {
          DestroyAndClearIOHandle(buf);
          buf->ClearBuffer();
        }
        buf->async_read_in_progress_ = false;
      }
    }

    // Prefetch buffer bytes discarded.
    uint64_t bytes_discarded = 0;
    // Iterate over buffers.
    for (auto& buf : bufs_) {
      if (buf->DoesBufferContainData()) {
        // If the last read was from this block and some bytes are still
        // unconsumed.
        if (prev_offset_ >= buf->offset_ &&
            prev_offset_ + prev_len_ < buf->offset_ + buf->CurrentSize()) {
          bytes_discarded +=
              buf->CurrentSize() - (prev_offset_ + prev_len_ - buf->offset_);
        }
        // If the last read was from previous blocks and this block is
        // unconsumed.
        else if (prev_offset_ < buf->offset_ &&
                 prev_offset_ + prev_len_ <= buf->offset_) {
          bytes_discarded += buf->CurrentSize();
        }
      }
    }

    RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded);

    for (auto& buf : bufs_) {
      delete buf;
      buf = nullptr;
    }

    for (auto& buf : free_bufs_) {
      delete buf;
      buf = nullptr;
    }

    if (overlap_buf_ != nullptr) {
      delete overlap_buf_;
      overlap_buf_ = nullptr;
    }
  }

  bool Enabled() const { return enable_; }

  // Called externally by the user to load data into the buffer from a file.
  // num_buffers_ should be left at its default value (1).
  //
  // opts : the IO options to use.
  // reader : the file reader.
  // offset : the file offset to start reading from.
  // n : the number of bytes to read.
  //
  Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader,
                  uint64_t offset, size_t n);

  // Request reading the data from a file asynchronously.
  // If the data already exists in the buffer, result will be updated.
  // reader : the file reader.
  // offset : the file offset to start reading from.
  // n : the number of bytes to read.
  // result : if the data already exists in the buffer, result will
  //          be updated with the data.
  //
  // If the data already exists in the buffer, it returns Status::OK; otherwise
  // it sends an asynchronous request and returns Status::TryAgain.
  Status PrefetchAsync(const IOOptions& opts, RandomAccessFileReader* reader,
                       uint64_t offset, size_t n, Slice* result);
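
  // A minimal sketch of the async flow described above (illustrative only;
  // `opts`, `reader`, `offset`, and `n` are assumed to be set up by the
  // caller):
  //
  //   Slice result;
  //   Status s =
  //       prefetch_buffer.PrefetchAsync(opts, reader, offset, n, &result);
  //   if (s.IsTryAgain()) {
  //     // The request was submitted; a later lookup into this buffer polls
  //     // for the data once it is actually needed.
  //   } else if (s.ok()) {
  //     // `result` already points at the requested bytes.
  //   }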

  // Tries returning the data for a file read from this buffer if that data is
  // in the buffer.
  // It handles tracking the minimum read offset if track_min_offset = true.
  // It also does the exponential readahead when readahead_size is set as part
  // of the constructor.
  //
  // opts : the IO options to use.
  // reader : the file reader.
  // offset : the file offset.
  // n : the number of bytes.
  // result : output buffer to put the data into.
  // s : output status.
  // for_compaction : true if the cache read is done for a compaction read.
  bool TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader,
                        uint64_t offset, size_t n, Slice* result, Status* s,
                        bool for_compaction = false);
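
  // Typical use (an illustrative sketch; the fallback read is the caller's
  // responsibility and the variable names are hypothetical):
  //
  //   Slice block;
  //   Status s;
  //   if (!prefetch_buffer.TryReadFromCache(opts, reader, offset, n, &block,
  //                                         &s)) {
  //     // Not served from the prefetch buffer; fall back to reading directly
  //     // from the file via `reader`.
  //   }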

  // The minimum `offset` ever passed to TryReadFromCache(). This will only be
  // tracked if track_min_offset = true.
  size_t min_offset_read() const { return min_offset_read_; }

  size_t GetPrefetchOffset() const { return bufs_.front()->offset_; }

  // Called in case of implicit auto prefetching.
  void UpdateReadPattern(const uint64_t& offset, const size_t& len,
                         bool decrease_readaheadsize) {
    if (decrease_readaheadsize) {
      DecreaseReadAheadIfEligible(offset, len);
    }
    prev_offset_ = offset;
    prev_len_ = len;
    explicit_prefetch_submitted_ = false;
  }

  void GetReadaheadState(ReadaheadFileInfo::ReadaheadInfo* readahead_info) {
    readahead_info->readahead_size = readahead_size_;
    readahead_info->num_file_reads = num_file_reads_;
  }

  void DecreaseReadAheadIfEligible(uint64_t offset, size_t size,
                                   size_t value = DEFAULT_DECREMENT) {
    if (bufs_.empty()) {
      return;
    }

    // Decrease the readahead_size if
    // - it's enabled internally by RocksDB (implicit_auto_readahead_) and,
    // - readahead_size is greater than 0 and,
    // - this block would have called the prefetch API if not found in cache,
    //   for which the conditions are:
    //   - few/no bytes are in the buffer and,
    //   - the block is sequential with the previous read and,
    //   - num_file_reads_ + 1 (including this read) >
    //     num_file_reads_for_auto_readahead_

    size_t curr_size = bufs_.front()->async_read_in_progress_
                           ? bufs_.front()->async_req_len_
                           : bufs_.front()->CurrentSize();
    if (implicit_auto_readahead_ && readahead_size_ > 0) {
      if ((offset + size > bufs_.front()->offset_ + curr_size) &&
          IsBlockSequential(offset) &&
          (num_file_reads_ + 1 > num_file_reads_for_auto_readahead_)) {
        readahead_size_ =
            std::max(initial_auto_readahead_size_,
                     (readahead_size_ >= value ? readahead_size_ - value : 0));
      }
    }
  }
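
  // Illustrative example of the decrement above (hypothetical values):
  //
  //   readahead_size_ = 32 KB, initial_auto_readahead_size_ = 8 KB,
  //   value = DEFAULT_DECREMENT = 8 KB
  //   // Each eligible call trims readahead_size_: 32 KB -> 24 KB -> 16 KB ->
  //   // 8 KB, never dropping below initial_auto_readahead_size_.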

  // Callback function passed to underlying FS in case of asynchronous reads.
  void PrefetchAsyncCallback(FSReadRequest& req, void* cb_arg);

  void TEST_GetBufferOffsetandSize(
      std::vector<std::pair<uint64_t, size_t>>& buffer_info) {
    for (size_t i = 0; i < bufs_.size(); i++) {
      buffer_info[i].first = bufs_[i]->offset_;
      buffer_info[i].second = bufs_[i]->async_read_in_progress_
                                  ? bufs_[i]->async_req_len_
                                  : bufs_[i]->CurrentSize();
    }
  }

 private:
  // Calculates the rounded-off offset and length to be prefetched based on
  // alignment and the data present in buffer_. It also allocates a new buffer
  // or refits the tail if required.
  void PrepareBufferForRead(BufferInfo* buf, size_t alignment, uint64_t offset,
                            size_t roundup_len, bool refit_tail,
                            uint64_t& aligned_useful_len);

  void AbortOutdatedIO(uint64_t offset);

  void AbortAllIOs();

  void ClearOutdatedData(uint64_t offset, size_t len);

  // It calls the Poll API to check for any pending asynchronous request.
  void PollIfNeeded(uint64_t offset, size_t len);

  Status PrefetchInternal(const IOOptions& opts, RandomAccessFileReader* reader,
                          uint64_t offset, size_t length, size_t readahead_size,
                          bool& copy_to_third_buffer);

  Status Read(BufferInfo* buf, const IOOptions& opts,
              RandomAccessFileReader* reader, uint64_t read_len,
              uint64_t aligned_useful_len, uint64_t start_offset);

  Status ReadAsync(BufferInfo* buf, const IOOptions& opts,
                   RandomAccessFileReader* reader, uint64_t read_len,
                   uint64_t start_offset);

  // Copy the data from src to overlap_buf_.
  void CopyDataToBuffer(BufferInfo* src, uint64_t& offset, size_t& length);

  bool IsBlockSequential(const size_t& offset) {
    return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset));
  }

  // Called in case of implicit auto prefetching.
  void ResetValues() {
    num_file_reads_ = 1;
    readahead_size_ = initial_auto_readahead_size_;
  }

  // Called in case of implicit auto prefetching.
  bool IsEligibleForPrefetch(uint64_t offset, size_t n) {
    // Prefetch only if this read is sequential; otherwise reset
    // readahead_size_ to its initial value.
    if (!IsBlockSequential(offset)) {
      UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
      ResetValues();
      return false;
    }
    num_file_reads_++;

    // Since an async request was submitted directly in the last call to
    // PrefetchAsync, skip the num_file_reads_ check, as this call only polls
    // the data submitted in the previous call.
    if (explicit_prefetch_submitted_) {
      return true;
    }
    if (num_file_reads_ <= num_file_reads_for_auto_readahead_) {
      UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
      return false;
    }
    return true;
  }

  bool IsEligibleForFurtherPrefetching() {
    if (free_bufs_.empty()) {
      return false;
    }
    // Readahead size can be 0 because of trimming.
    if (readahead_size_ == 0) {
      return false;
    }
    return true;
  }

  void DestroyAndClearIOHandle(BufferInfo* buf) {
    if (buf->io_handle_ != nullptr && buf->del_fn_ != nullptr) {
      buf->del_fn_(buf->io_handle_);
      buf->io_handle_ = nullptr;
      buf->del_fn_ = nullptr;
    }
    buf->async_read_in_progress_ = false;
  }
|
|
|
|
|
2022-11-01 23:06:51 +00:00
|
|
|
Status HandleOverlappingData(const IOOptions& opts,
|
|
|
|
RandomAccessFileReader* reader, uint64_t offset,
|
|
|
|
size_t length, size_t readahead_size,
|
|
|
|
bool& copy_to_third_buffer, uint64_t& tmp_offset,
|
|
|
|
size_t& tmp_length);

  bool TryReadFromCacheUntracked(const IOOptions& opts,
                                 RandomAccessFileReader* reader,
                                 uint64_t offset, size_t n, Slice* result,
                                 Status* s,
                                 bool for_compaction = false);

  void ReadAheadSizeTuning(BufferInfo* buf, bool read_curr_block,
                           bool refit_tail, uint64_t prev_buf_end_offset,
                           size_t alignment, size_t length,
                           size_t readahead_size, uint64_t& offset,
                           uint64_t& end_offset, size_t& read_len,
                           uint64_t& aligned_useful_len);

  void UpdateStats(bool found_in_buffer, size_t length_found) {
    if (found_in_buffer) {
      RecordTick(stats_, PREFETCH_HITS);
    }
    if (length_found > 0) {
      RecordTick(stats_, PREFETCH_BYTES_USEFUL, length_found);
    }
  }
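
  // Example (hypothetical numbers): a lookup fully served from a prefetched
  // buffer with 4096 useful bytes would be recorded as
  //
  //   UpdateStats(/*found_in_buffer=*/true, /*length_found=*/4096);
  //
  // bumping PREFETCH_HITS by 1 and PREFETCH_BYTES_USEFUL by 4096.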

  void UpdateReadAheadTrimmedStat(size_t initial_length,
                                  size_t updated_length) {
    if (initial_length != updated_length) {
      RecordTick(stats_, READAHEAD_TRIMMED);
    }
  }
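
  // Example (hypothetical numbers): a 64 KB readahead trimmed down to 16 KB,
  // i.e. UpdateReadAheadTrimmedStat(64 * 1024, 16 * 1024), records one
  // READAHEAD_TRIMMED tick; equal lengths record nothing.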

  Status PrefetchRemBuffers(const IOOptions& opts,
                            RandomAccessFileReader* reader,
                            uint64_t end_offset1, size_t alignment,
                            size_t readahead_size);

  // *** BEGIN APIs related to allocating and freeing buffers ***

  bool IsBufferQueueEmpty() { return bufs_.empty(); }

  BufferInfo* GetFirstBuffer() { return bufs_.front(); }

  BufferInfo* GetLastBuffer() { return bufs_.back(); }

  size_t NumBuffersAllocated() { return bufs_.size(); }

  // Move a buffer from the free list to the back of the in-use queue.
  void AllocateBuffer() {
    assert(!free_bufs_.empty());
    BufferInfo* buf = free_bufs_.front();
    free_bufs_.pop_front();
    bufs_.emplace_back(buf);
  }

  void AllocateBufferIfEmpty() {
    if (bufs_.empty()) {
      AllocateBuffer();
    }
  }
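
  // Sketch of the buffer lifecycle implied by these helpers (illustrative
  // only, not a prescribed call sequence):
  //
  //   AllocateBufferIfEmpty();            // take a buffer from free_bufs_
  //   BufferInfo* buf = GetLastBuffer();  // newest in-use buffer
  //   // ... fill buf with prefetched data ...
  //   FreeFrontBuffer();                  // consumed: clear and recycle it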

  // Clear the front (oldest) buffer and return it to the free list.
  void FreeFrontBuffer() {
    BufferInfo* buf = bufs_.front();
    buf->ClearBuffer();
    bufs_.pop_front();
    free_bufs_.emplace_back(buf);
  }

  // Clear the last (most recently allocated) buffer and return it to the
  // free list.
  void FreeLastBuffer() {
    BufferInfo* buf = bufs_.back();
    buf->ClearBuffer();
    bufs_.pop_back();
    free_bufs_.emplace_back(buf);
  }

  // Clear all in-use buffers and return them to the free list.
  void FreeAllBuffers() {
    while (!bufs_.empty()) {
      BufferInfo* buf = bufs_.front();
      buf->ClearBuffer();
      bufs_.pop_front();
      free_bufs_.emplace_back(buf);
    }
  }

  // Return buffers that hold no data and have no async read in progress to
  // the free list, preserving the order of the remaining in-use buffers.
  void FreeEmptyBuffers() {
    if (bufs_.empty()) {
      return;
    }

    std::deque<BufferInfo*> tmp_buf;
    while (!bufs_.empty()) {
      BufferInfo* buf = bufs_.front();
      bufs_.pop_front();
      if (buf->async_read_in_progress_ || buf->DoesBufferContainData()) {
        tmp_buf.emplace_back(buf);
      } else {
        free_bufs_.emplace_back(buf);
      }
    }
    bufs_ = tmp_buf;
  }
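
  // Illustrative example: if bufs_ holds {A: has data, B: empty, C: async
  // read in progress}, FreeEmptyBuffers() keeps {A, C} in order and moves B
  // back to free_bufs_.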

  // *** END APIs related to allocating and freeing buffers ***

  // Buffers currently holding (or being filled with) prefetched data.
  std::deque<BufferInfo*> bufs_;
  // Buffers available for reuse.
  std::deque<BufferInfo*> free_bufs_;
  // Scratch buffer used to assemble data copied out of other buffers (see
  // CopyDataToBuffer()).
  BufferInfo* overlap_buf_ = nullptr;

  // Current readahead size.
  size_t readahead_size_;
  // Readahead size to fall back to when auto readahead is (re)started.
  size_t initial_auto_readahead_size_;
  // FilePrefetchBuffer object won't be created from Iterator flow if
  // max_readahead_size_ = 0.
  size_t max_readahead_size_;

  // The minimum `offset` ever passed to TryReadFromCache().
  size_t min_offset_read_;
  // If false, TryReadFromCache() always returns false, and we only take stats
  // for min_offset_read_ if track_min_offset_ is true.
  bool enable_;
  // If true, track the minimum `offset` ever passed to TryReadFromCache(),
  // which can be fetched from min_offset_read().
  bool track_min_offset_;

  // If true, RocksDB enables readahead implicitly after
  // num_file_reads_for_auto_readahead_ sequential reads.
  bool implicit_auto_readahead_;
  // Offset and length of the previous read, used to detect sequential access.
  uint64_t prev_offset_;
  size_t prev_len_;
  // num_file_reads_ and num_file_reads_for_auto_readahead_ are only used when
  // implicit_auto_readahead_ is set.
  uint64_t num_file_reads_for_auto_readahead_;
  uint64_t num_file_reads_;

  // If explicit_prefetch_submitted_ is set, it indicates that RocksDB called
  // PrefetchAsync directly to submit a request. TryReadFromCache then needs
  // to poll that submitted request without checking whether the data is
  // sequential or consulting num_file_reads_.
  bool explicit_prefetch_submitted_;

  FileSystem* fs_;
  SystemClock* clock_;
  Statistics* stats_;

  FilePrefetchBufferUsage usage_;

  std::function<void(bool, uint64_t&, uint64_t&)> readaheadsize_cb_;

  // num_buffers_ is the number of buffers maintained by FilePrefetchBuffer to
  // prefetch the data at a time.
  size_t num_buffers_;
};

}  // namespace ROCKSDB_NAMESPACE