2019-09-16 17:31:27 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "file/random_access_file_reader.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <mutex>
|
|
|
|
|
2021-01-26 06:07:26 +00:00
|
|
|
#include "file/file_util.h"
|
2019-09-16 17:31:27 +00:00
|
|
|
#include "monitoring/histogram.h"
|
|
|
|
#include "monitoring/iostats_context_imp.h"
|
|
|
|
#include "port/port.h"
|
2020-04-30 21:48:51 +00:00
|
|
|
#include "table/format.h"
|
2019-09-16 17:31:27 +00:00
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "util/random.h"
|
2023-05-17 18:27:09 +00:00
|
|
|
#include "util/rate_limiter_impl.h"
|
2019-09-16 17:31:27 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
// Picks the latency histogram a file read should be charged to, based on the
// I/O activity that is driving the read.
//
// Internal activities (flush, compaction, DB open) always get their own
// histogram. User-read activities (Get, MultiGet, iterator, DB/file checksum
// verification) are only broken out when `stats` is non-null and its stats
// level is above kExceptDetailedTimers. Every other case maps to
// HISTOGRAM_ENUM_MAX, meaning "no per-activity histogram".
inline Histograms GetFileReadHistograms(Statistics* stats,
                                        Env::IOActivity io_activity) {
  // Lazily evaluated so that stats->get_stats_level() is only consulted for
  // user-read activities.
  const auto user_read_histogram = [stats](Histograms hist) {
    return (stats != nullptr &&
            stats->get_stats_level() > StatsLevel::kExceptDetailedTimers)
               ? hist
               : Histograms::HISTOGRAM_ENUM_MAX;
  };
  switch (io_activity) {
    case Env::IOActivity::kFlush:
      return Histograms::FILE_READ_FLUSH_MICROS;
    case Env::IOActivity::kCompaction:
      return Histograms::FILE_READ_COMPACTION_MICROS;
    case Env::IOActivity::kDBOpen:
      return Histograms::FILE_READ_DB_OPEN_MICROS;
    case Env::IOActivity::kGet:
      return user_read_histogram(Histograms::FILE_READ_GET_MICROS);
    case Env::IOActivity::kMultiGet:
      return user_read_histogram(Histograms::FILE_READ_MULTIGET_MICROS);
    case Env::IOActivity::kDBIterator:
      return user_read_histogram(Histograms::FILE_READ_DB_ITERATOR_MICROS);
    case Env::IOActivity::kVerifyDBChecksum:
      return user_read_histogram(
          Histograms::FILE_READ_VERIFY_DB_CHECKSUM_MICROS);
    case Env::IOActivity::kVerifyFileChecksums:
      return user_read_histogram(
          Histograms::FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS);
    default:
      return Histograms::HISTOGRAM_ENUM_MAX;
  }
}
|
2022-02-18 21:35:36 +00:00
|
|
|
// Accumulates the per-read IO statistics for a read of `size` bytes:
// the global bytes-read counter, the last/non-last level tickers, and —
// when the file has a known temperature — the per-temperature
// iostats counters and tickers.
inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
                          bool is_last_level, size_t size) {
  IOSTATS_ADD(bytes_read, size);

  // Attribute the read either to the last level or to a non-last level.
  RecordTick(stats,
             is_last_level ? LAST_LEVEL_READ_BYTES : NON_LAST_LEVEL_READ_BYTES,
             size);
  RecordTick(stats,
             is_last_level ? LAST_LEVEL_READ_COUNT : NON_LAST_LEVEL_READ_COUNT,
             1);

  // Per-temperature stats. Files with Temperature::kUnknown (or any other
  // unhandled temperature) are intentionally not tracked here.
  switch (file_temperature) {
    case Temperature::kHot:
      IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size);
      IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1);
      RecordTick(stats, HOT_FILE_READ_BYTES, size);
      RecordTick(stats, HOT_FILE_READ_COUNT, 1);
      break;
    case Temperature::kWarm:
      IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size);
      IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1);
      RecordTick(stats, WARM_FILE_READ_BYTES, size);
      RecordTick(stats, WARM_FILE_READ_COUNT, 1);
      break;
    case Temperature::kCold:
      IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
      IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
      RecordTick(stats, COLD_FILE_READ_BYTES, size);
      RecordTick(stats, COLD_FILE_READ_COUNT, 1);
      break;
    default:
      break;
  }
}
|
|
|
|
|
2021-04-01 17:06:55 +00:00
|
|
|
// Factory: opens `fname` for random access through `fs` and, on success,
// stores a RandomAccessFileReader wrapping it in `*reader`. On failure
// `*reader` is left untouched and the IO error is returned.
IOStatus RandomAccessFileReader::Create(
    const std::shared_ptr<FileSystem>& fs, const std::string& fname,
    const FileOptions& file_opts,
    std::unique_ptr<RandomAccessFileReader>* reader, IODebugContext* dbg) {
  std::unique_ptr<FSRandomAccessFile> f;
  const IOStatus open_status =
      fs->NewRandomAccessFile(fname, file_opts, &f, dbg);
  if (open_status.ok()) {
    *reader = std::unique_ptr<RandomAccessFileReader>(
        new RandomAccessFileReader(std::move(f), fname));
  }
  return open_status;
}
|
Support direct IO in RandomAccessFileReader::MultiRead (#6446)
Summary:
By supporting direct IO in RandomAccessFileReader::MultiRead, the benefits of parallel IO (IO uring) and direct IO can be combined.
In direct IO mode, read requests are aligned and merged together before being issued to RandomAccessFile::MultiRead, so blocks in the original requests might share the same underlying buffer, the shared buffers are returned in `aligned_bufs`, which is a new parameter of the `MultiRead` API.
For example, suppose alignment requirement for direct IO is 4KB, one request is (offset: 1KB, len: 1KB), another request is (offset: 3KB, len: 1KB), then since they all belong to page (offset: 0, len: 4KB), `MultiRead` only reads the page with direct IO into a buffer on heap, and returns 2 Slices referencing regions in that same buffer. See `random_access_file_reader_test.cc` for more examples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6446
Test Plan: Added a new test `random_access_file_reader_test.cc`.
Reviewed By: anand1976
Differential Revision: D20097518
Pulled By: cheng-chang
fbshipit-source-id: ca48a8faf9c3af146465c102ef6b266a363e78d1
2020-03-20 23:15:40 +00:00
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
// Reads `n` bytes at `offset` into `*result`, timing the call (StopWatch +
// per-activity histogram) and recording IO statistics.
//
// Two paths:
//  - Direct I/O with an unaligned request: the request is expanded to
//    alignment boundaries, read into an internal AlignedBuffer, and the
//    requested sub-range is either copied into `scratch` (aligned_buf ==
//    nullptr) or handed back zero-copy through `aligned_buf`, in which case
//    `*result` points into the released buffer.
//  - Otherwise: data is read directly into `scratch`, possibly in multiple
//    chunks when a rate limiter caps the size of each request.
//
// Rate limiting only applies when opts.rate_limiter_priority is not
// Env::IO_TOTAL and a rate limiter is configured.
IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
                                      size_t n, Slice* result, char* scratch,
                                      AlignedBuf* aligned_buf) const {
  // Silence "unused parameter" when asserts/branches below are compiled out.
  (void)aligned_buf;
  const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority;

  TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr);

  // To be paranoid: modify scratch a little bit, so in case underlying
  // FileSystem doesn't fill the buffer but return success and `scratch` returns
  // contains a previous block, returned value will not pass checksum.
  if (n > 0 && scratch != nullptr) {
    // This byte might not change anything for direct I/O case, but it's OK.
    scratch[0]++;
  }

  IOStatus io_s;
  uint64_t elapsed = 0;
  size_t alignment = file_->GetRequiredBufferAlignment();
  bool is_aligned = false;
  if (scratch != nullptr) {
    // Check if offset, length and buffer are aligned.
    is_aligned = (offset & (alignment - 1)) == 0 &&
                 (n & (alignment - 1)) == 0 &&
                 (uintptr_t(scratch) & (alignment - 1)) == 0;
  }

  {
    // Times the whole read (including any rate-limiter delay, which the
    // buffered path explicitly excludes via DelayStart/DelayStop below).
    StopWatch sw(clock_, stats_, hist_type_,
                 GetFileReadHistograms(stats_, opts.io_activity),
                 (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
                 true /*delay_enabled*/);
    auto prev_perf_level = GetPerfLevel();
    IOSTATS_TIMER_GUARD(read_nanos);
    if (use_direct_io() && is_aligned == false) {
      // Direct I/O, unaligned request: expand [offset, offset + n) to
      // alignment boundaries and read into a private aligned buffer.
      size_t aligned_offset =
          TruncateToPageBoundary(alignment, static_cast<size_t>(offset));
      size_t offset_advance = static_cast<size_t>(offset) - aligned_offset;
      size_t read_size =
          Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset;
      AlignedBuffer buf;
      buf.Alignment(alignment);
      buf.AllocateNewBuffer(read_size);
      // Loop until the whole aligned range is read, the file ends short, or
      // an error occurs; each iteration reads at most the rate-limited quota.
      while (buf.CurrentSize() < read_size) {
        size_t allowed;
        if (rate_limiter_priority != Env::IO_TOTAL &&
            rate_limiter_ != nullptr) {
          allowed = rate_limiter_->RequestToken(
              buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
              rate_limiter_priority, stats_, RateLimiter::OpType::kRead);
        } else {
          assert(buf.CurrentSize() == 0);
          allowed = read_size;
        }
        Slice tmp;

        FileOperationInfo::StartTimePoint start_ts;
        uint64_t orig_offset = 0;
        if (ShouldNotifyListeners()) {
          start_ts = FileOperationInfo::StartNow();
          orig_offset = aligned_offset + buf.CurrentSize();
        }

        {
          IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
          // Only user reads are expected to specify a timeout. And user reads
          // are not subjected to rate_limiter and should go through only
          // one iteration of this loop, so we don't need to check and adjust
          // the opts.timeout before calling file_->Read
          assert(!opts.timeout.count() || allowed == read_size);
          io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts,
                             &tmp, buf.Destination(), nullptr);
        }
        if (ShouldNotifyListeners()) {
          auto finish_ts = FileOperationInfo::FinishNow();
          NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
                                 io_s);
          if (!io_s.ok()) {
            NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
                            tmp.size(), orig_offset);
          }
        }

        buf.Size(buf.CurrentSize() + tmp.size());
        // A short read (tmp.size() < allowed) means EOF; stop either way.
        if (!io_s.ok() || tmp.size() < allowed) {
          break;
        }
      }
      size_t res_len = 0;
      if (io_s.ok() && offset_advance < buf.CurrentSize()) {
        res_len = std::min(buf.CurrentSize() - offset_advance, n);
        if (aligned_buf == nullptr) {
          // Copy the requested sub-range out of the aligned buffer.
          buf.Read(scratch, offset_advance, res_len);
        } else {
          // Zero-copy hand-off: the caller takes ownership of the aligned
          // buffer; `scratch` is repointed into it for the Slice below.
          scratch = buf.BufferStart() + offset_advance;
          aligned_buf->reset(buf.Release());
        }
      }
      *result = Slice(scratch, res_len);
    } else {
      // Buffered (or already-aligned direct) read straight into `scratch`.
      size_t pos = 0;
      const char* res_scratch = nullptr;
      while (pos < n) {
        size_t allowed;
        if (rate_limiter_priority != Env::IO_TOTAL &&
            rate_limiter_ != nullptr) {
          // Exclude rate-limiter wait time from the StopWatch measurement.
          if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
            sw.DelayStart();
          }
          allowed = rate_limiter_->RequestToken(
              n - pos, (use_direct_io() ? alignment : 0), rate_limiter_priority,
              stats_, RateLimiter::OpType::kRead);
          if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
            sw.DelayStop();
          }
        } else {
          allowed = n;
        }
        Slice tmp_result;

        FileOperationInfo::StartTimePoint start_ts;
        if (ShouldNotifyListeners()) {
          start_ts = FileOperationInfo::StartNow();
        }

        {
          IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
          // Only user reads are expected to specify a timeout. And user reads
          // are not subjected to rate_limiter and should go through only
          // one iteration of this loop, so we don't need to check and adjust
          // the opts.timeout before calling file_->Read
          assert(!opts.timeout.count() || allowed == n);
          io_s = file_->Read(offset + pos, allowed, opts, &tmp_result,
                             scratch + pos, nullptr);
        }
        if (ShouldNotifyListeners()) {
          auto finish_ts = FileOperationInfo::FinishNow();
          NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
                                 finish_ts, io_s);

          if (!io_s.ok()) {
            NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
                            tmp_result.size(), offset + pos);
          }
        }
        if (res_scratch == nullptr) {
          // we can't simply use `scratch` because reads of mmap'd files return
          // data in a different buffer.
          res_scratch = tmp_result.data();
        } else {
          // make sure chunks are inserted contiguously into `res_scratch`.
          assert(tmp_result.data() == res_scratch + pos);
        }
        pos += tmp_result.size();
        if (!io_s.ok() || tmp_result.size() < allowed) {
          break;
        }
      }
      *result = Slice(res_scratch, io_s.ok() ? pos : 0);
    }
    RecordIOStats(stats_, file_temperature_, is_last_level_, result->size());
    SetPerfLevel(prev_perf_level);
  }
  if (stats_ != nullptr && file_read_hist_ != nullptr) {
    file_read_hist_->Add(elapsed);
  }

#ifndef NDEBUG
  auto pair = std::make_pair(&file_name_, &io_s);
  if (offset == 0) {
    TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read::BeforeReturn",
                             &pair);
  }
  TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read::AnyOffset", &pair);
#endif
  return io_s;
}
|
|
|
|
|
Support direct IO in RandomAccessFileReader::MultiRead (#6446)
Summary:
By supporting direct IO in RandomAccessFileReader::MultiRead, the benefits of parallel IO (IO uring) and direct IO can be combined.
In direct IO mode, read requests are aligned and merged together before being issued to RandomAccessFile::MultiRead, so blocks in the original requests might share the same underlying buffer, the shared buffers are returned in `aligned_bufs`, which is a new parameter of the `MultiRead` API.
For example, suppose alignment requirement for direct IO is 4KB, one request is (offset: 1KB, len: 1KB), another request is (offset: 3KB, len: 1KB), then since they all belong to page (offset: 0, len: 4KB), `MultiRead` only reads the page with direct IO into a buffer on heap, and returns 2 Slices referencing regions in that same buffer. See `random_access_file_reader_test.cc` for more examples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6446
Test Plan: Added a new test `random_access_file_reader_test.cc`.
Reviewed By: anand1976
Differential Revision: D20097518
Pulled By: cheng-chang
fbshipit-source-id: ca48a8faf9c3af146465c102ef6b266a363e78d1
2020-03-20 23:15:40 +00:00
|
|
|
size_t End(const FSReadRequest& r) {
|
|
|
|
return static_cast<size_t>(r.offset) + r.len;
|
|
|
|
}
|
|
|
|
|
|
|
|
FSReadRequest Align(const FSReadRequest& r, size_t alignment) {
|
|
|
|
FSReadRequest req;
|
|
|
|
req.offset = static_cast<uint64_t>(
|
2022-10-25 01:34:52 +00:00
|
|
|
TruncateToPageBoundary(alignment, static_cast<size_t>(r.offset)));
|
Support direct IO in RandomAccessFileReader::MultiRead (#6446)
Summary:
By supporting direct IO in RandomAccessFileReader::MultiRead, the benefits of parallel IO (IO uring) and direct IO can be combined.
In direct IO mode, read requests are aligned and merged together before being issued to RandomAccessFile::MultiRead, so blocks in the original requests might share the same underlying buffer, the shared buffers are returned in `aligned_bufs`, which is a new parameter of the `MultiRead` API.
For example, suppose alignment requirement for direct IO is 4KB, one request is (offset: 1KB, len: 1KB), another request is (offset: 3KB, len: 1KB), then since they all belong to page (offset: 0, len: 4KB), `MultiRead` only reads the page with direct IO into a buffer on heap, and returns 2 Slices referencing regions in that same buffer. See `random_access_file_reader_test.cc` for more examples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6446
Test Plan: Added a new test `random_access_file_reader_test.cc`.
Reviewed By: anand1976
Differential Revision: D20097518
Pulled By: cheng-chang
fbshipit-source-id: ca48a8faf9c3af146465c102ef6b266a363e78d1
2020-03-20 23:15:40 +00:00
|
|
|
req.len = Roundup(End(r), alignment) - req.offset;
|
2020-03-24 03:12:38 +00:00
|
|
|
req.scratch = nullptr;
|
Support direct IO in RandomAccessFileReader::MultiRead (#6446)
Summary:
By supporting direct IO in RandomAccessFileReader::MultiRead, the benefits of parallel IO (IO uring) and direct IO can be combined.
In direct IO mode, read requests are aligned and merged together before being issued to RandomAccessFile::MultiRead, so blocks in the original requests might share the same underlying buffer, the shared buffers are returned in `aligned_bufs`, which is a new parameter of the `MultiRead` API.
For example, suppose alignment requirement for direct IO is 4KB, one request is (offset: 1KB, len: 1KB), another request is (offset: 3KB, len: 1KB), then since they all belong to page (offset: 0, len: 4KB), `MultiRead` only reads the page with direct IO into a buffer on heap, and returns 2 Slices referencing regions in that same buffer. See `random_access_file_reader_test.cc` for more examples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6446
Test Plan: Added a new test `random_access_file_reader_test.cc`.
Reviewed By: anand1976
Differential Revision: D20097518
Pulled By: cheng-chang
fbshipit-source-id: ca48a8faf9c3af146465c102ef6b266a363e78d1
2020-03-20 23:15:40 +00:00
|
|
|
return req;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Attempts to merge `src` into `dest`, where both are aligned read requests.
// If the two byte ranges overlap or touch, `dest` is widened to the union of
// the two ranges and true is returned; if there is a strict gap between the
// ranges, `dest` is left untouched and false is returned.
bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) {
  const size_t begin_a = static_cast<size_t>(dest->offset);
  const size_t begin_b = static_cast<size_t>(src.offset);
  const size_t end_a = End(*dest);
  const size_t end_b = End(src);
  // Disjoint ranges cannot be merged; ranges that merely touch
  // (max(begin) == min(end)) are still combined into one request.
  if (std::max(begin_a, begin_b) > std::min(end_a, end_b)) {
    return false;
  }
  dest->offset = static_cast<uint64_t>(std::min(begin_a, begin_b));
  dest->len = std::max(end_a, end_b) - dest->offset;
  return true;
}
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
// Issues `num_reqs` (> 0) read requests against the file, delegating to the
// underlying FSRandomAccessFile::MultiRead. Requests must be sorted by
// increasing offset (asserted in debug builds below). On return, each
// request's `status` and `result` are populated.
//
// In direct I/O mode, requests are first aligned to the file's required
// buffer alignment and merged when their aligned ranges overlap or touch;
// the merged reads share one aligned heap buffer whose ownership is handed
// to the caller through `aligned_buf`, and each original request's `result`
// slice points into that buffer.
IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
                                           FSReadRequest* read_reqs,
                                           size_t num_reqs,
                                           AlignedBuf* aligned_buf) const {
  (void)aligned_buf;  // suppress warning of unused variable in LITE mode
  assert(num_reqs > 0);

#ifndef NDEBUG
  // Debug-only sanity check: callers must pass requests sorted by offset,
  // which the direct-I/O merge loop below relies on.
  for (size_t i = 0; i < num_reqs - 1; ++i) {
    assert(read_reqs[i].offset <= read_reqs[i + 1].offset);
  }
#endif  // !NDEBUG
  const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority;

  // To be paranoid modify scratch a little bit, so in case underlying
  // FileSystem doesn't fill the buffer but return success and `scratch` returns
  // contains a previous block, returned value will not pass checksum.
  // This byte might not change anything for direct I/O case, but it's OK.
  for (size_t i = 0; i < num_reqs; i++) {
    FSReadRequest& r = read_reqs[i];
    if (r.len > 0 && r.scratch != nullptr) {
      r.scratch[0]++;
    }
  }

  IOStatus io_s;
  uint64_t elapsed = 0;
  {
    StopWatch sw(clock_, stats_, hist_type_,
                 GetFileReadHistograms(stats_, opts.io_activity),
                 (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
                 true /*delay_enabled*/);
    auto prev_perf_level = GetPerfLevel();
    IOSTATS_TIMER_GUARD(read_nanos);

    // By default the user's requests are forwarded to the FileSystem as-is.
    // For direct I/O they are replaced by the aligned/merged `aligned_reqs`.
    FSReadRequest* fs_reqs = read_reqs;
    size_t num_fs_reqs = num_reqs;
    std::vector<FSReadRequest> aligned_reqs;
    if (use_direct_io()) {
      // num_reqs is the max possible size,
      // this can reduce std::vector's internal resize operations.
      aligned_reqs.reserve(num_reqs);
      // Align and merge the read requests.
      size_t alignment = file_->GetRequiredBufferAlignment();
      for (size_t i = 0; i < num_reqs; i++) {
        FSReadRequest r = Align(read_reqs[i], alignment);
        if (i == 0) {
          // head
          aligned_reqs.push_back(std::move(r));

        } else if (!TryMerge(&aligned_reqs.back(), r)) {
          // head + n
          aligned_reqs.push_back(std::move(r));

        } else {
          // unused
          r.status.PermitUncheckedError();
        }
      }
      TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs",
                               &aligned_reqs);

      // Allocate aligned buffer and let scratch buffers point to it.
      size_t total_len = 0;
      for (const auto& r : aligned_reqs) {
        total_len += r.len;
      }
      AlignedBuffer buf;
      buf.Alignment(alignment);
      buf.AllocateNewBuffer(total_len);
      char* scratch = buf.BufferStart();
      for (auto& r : aligned_reqs) {
        r.scratch = scratch;
        scratch += r.len;
      }

      // Hand ownership of the aligned buffer to the caller; the result
      // slices populated below reference regions of this buffer.
      aligned_buf->reset(buf.Release());
      fs_reqs = aligned_reqs.data();
      num_fs_reqs = aligned_reqs.size();
    }

    FileOperationInfo::StartTimePoint start_ts;
    if (ShouldNotifyListeners()) {
      start_ts = FileOperationInfo::StartNow();
    }

    {
      IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
      if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
        // TODO: ideally we should call `RateLimiter::RequestToken()` for
        // allowed bytes to multi-read and then consume those bytes by
        // satisfying as many requests in `MultiRead()` as possible, instead of
        // what we do here, which can cause burst when the
        // `total_multi_read_size` is big.
        size_t total_multi_read_size = 0;
        assert(fs_reqs != nullptr);
        for (size_t i = 0; i < num_fs_reqs; ++i) {
          FSReadRequest& req = fs_reqs[i];
          total_multi_read_size += req.len;
        }
        // Drain the total size through the rate limiter one burst at a time,
        // since a single Request() may not exceed the single-burst limit.
        size_t remaining_bytes = total_multi_read_size;
        size_t request_bytes = 0;
        while (remaining_bytes > 0) {
          request_bytes = std::min(
              static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()),
              remaining_bytes);
          rate_limiter_->Request(request_bytes, rate_limiter_priority,
                                 nullptr /* stats */,
                                 RateLimiter::OpType::kRead);
          remaining_bytes -= request_bytes;
        }
      }
      io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr);
      RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs);
    }

    if (use_direct_io()) {
      // Populate results in the unaligned read requests.
      // `aligned_i` walks the merged requests in lockstep with the sorted
      // original requests; advance it once the current original request
      // starts past the end of the current merged request.
      size_t aligned_i = 0;
      for (size_t i = 0; i < num_reqs; i++) {
        auto& r = read_reqs[i];
        if (static_cast<size_t>(r.offset) > End(aligned_reqs[aligned_i])) {
          aligned_i++;
        }
        const auto& fs_r = fs_reqs[aligned_i];
        r.status = fs_r.status;
        if (r.status.ok()) {
          // Offset of this request's data within the merged request's buffer.
          uint64_t offset = r.offset - fs_r.offset;
          if (fs_r.result.size() <= offset) {
            // No byte in the read range is returned.
            r.result = Slice();
          } else {
            // Clamp to what was actually read (short reads near EOF).
            size_t len = std::min(
                r.len, static_cast<size_t>(fs_r.result.size() - offset));
            r.result = Slice(fs_r.scratch + offset, len);
          }
        } else {
          r.result = Slice();
        }
      }
    }

    // Per-request listener notification, error reporting, and IO stats.
    for (size_t i = 0; i < num_reqs; ++i) {
      if (ShouldNotifyListeners()) {
        auto finish_ts = FileOperationInfo::FinishNow();
        NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(),
                               start_ts, finish_ts, read_reqs[i].status);
      }
      if (!read_reqs[i].status.ok()) {
        NotifyOnIOError(read_reqs[i].status, FileOperationType::kRead,
                        file_name(), read_reqs[i].result.size(),
                        read_reqs[i].offset);
      }

      RecordIOStats(stats_, file_temperature_, is_last_level_,
                    read_reqs[i].result.size());
    }
    SetPerfLevel(prev_perf_level);
  }
  if (stats_ != nullptr && file_read_hist_ != nullptr) {
    file_read_hist_->Add(elapsed);
  }

  return io_s;
}
|
Support direct IO in RandomAccessFileReader::MultiRead (#6446)
Summary:
By supporting direct IO in RandomAccessFileReader::MultiRead, the benefits of parallel IO (IO uring) and direct IO can be combined.
In direct IO mode, read requests are aligned and merged together before being issued to RandomAccessFile::MultiRead, so blocks in the original requests might share the same underlying buffer, the shared buffers are returned in `aligned_bufs`, which is a new parameter of the `MultiRead` API.
For example, suppose alignment requirement for direct IO is 4KB, one request is (offset: 1KB, len: 1KB), another request is (offset: 3KB, len: 1KB), then since they all belong to page (offset: 0, len: 4KB), `MultiRead` only reads the page with direct IO into a buffer on heap, and returns 2 Slices referencing regions in that same buffer. See `random_access_file_reader_test.cc` for more examples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6446
Test Plan: Added a new test `random_access_file_reader_test.cc`.
Reviewed By: anand1976
Differential Revision: D20097518
Pulled By: cheng-chang
fbshipit-source-id: ca48a8faf9c3af146465c102ef6b266a363e78d1
2020-03-20 23:15:40 +00:00
|
|
|
|
2021-01-26 06:07:26 +00:00
|
|
|
// Populates `opts` from the user's ReadOptions (deadline/timeout fields etc.
// via PrepareIOFromReadOptions), using this reader's clock when it has one
// and the default system clock otherwise.
IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
                                                  IOOptions& opts) const {
  const auto clock =
      (clock_ != nullptr) ? clock_ : SystemClock::Default().get();
  return PrepareIOFromReadOptions(ro, clock, opts);
}
|
2022-03-21 14:12:43 +00:00
|
|
|
|
|
|
|
IOStatus RandomAccessFileReader::ReadAsync(
|
|
|
|
FSReadRequest& req, const IOOptions& opts,
|
|
|
|
std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
|
2022-07-06 18:42:59 +00:00
|
|
|
void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) {
|
|
|
|
IOStatus s;
|
2022-04-06 21:26:53 +00:00
|
|
|
// Create a callback and populate info.
|
|
|
|
auto read_async_callback =
|
|
|
|
std::bind(&RandomAccessFileReader::ReadAsyncCallback, this,
|
|
|
|
std::placeholders::_1, std::placeholders::_2);
|
2023-12-06 21:48:15 +00:00
|
|
|
|
|
|
|
ReadAsyncInfo* read_async_info = new ReadAsyncInfo(
|
|
|
|
cb, cb_arg, (clock_ != nullptr ? clock_->NowMicros() : 0));
|
2022-04-06 21:26:53 +00:00
|
|
|
|
|
|
|
if (ShouldNotifyListeners()) {
|
|
|
|
read_async_info->fs_start_ts_ = FileOperationInfo::StartNow();
|
|
|
|
}
|
|
|
|
|
2022-07-06 18:42:59 +00:00
|
|
|
size_t alignment = file_->GetRequiredBufferAlignment();
|
|
|
|
bool is_aligned = (req.offset & (alignment - 1)) == 0 &&
|
|
|
|
(req.len & (alignment - 1)) == 0 &&
|
|
|
|
(uintptr_t(req.scratch) & (alignment - 1)) == 0;
|
|
|
|
read_async_info->is_aligned_ = is_aligned;
|
|
|
|
|
2022-11-14 05:38:35 +00:00
|
|
|
uint64_t elapsed = 0;
|
2022-07-06 18:42:59 +00:00
|
|
|
if (use_direct_io() && is_aligned == false) {
|
|
|
|
FSReadRequest aligned_req = Align(req, alignment);
|
2022-09-13 00:42:01 +00:00
|
|
|
aligned_req.status.PermitUncheckedError();
|
2022-07-06 18:42:59 +00:00
|
|
|
|
|
|
|
// Allocate aligned buffer.
|
|
|
|
read_async_info->buf_.Alignment(alignment);
|
|
|
|
read_async_info->buf_.AllocateNewBuffer(aligned_req.len);
|
|
|
|
|
|
|
|
// Set rem fields in aligned FSReadRequest.
|
|
|
|
aligned_req.scratch = read_async_info->buf_.BufferStart();
|
|
|
|
|
|
|
|
// Set user provided fields to populate back in callback.
|
|
|
|
read_async_info->user_scratch_ = req.scratch;
|
|
|
|
read_async_info->user_aligned_buf_ = aligned_buf;
|
|
|
|
read_async_info->user_len_ = req.len;
|
|
|
|
read_async_info->user_offset_ = req.offset;
|
|
|
|
read_async_info->user_result_ = req.result;
|
|
|
|
|
|
|
|
assert(read_async_info->buf_.CurrentSize() == 0);
|
|
|
|
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
StopWatch sw(clock_, stats_, hist_type_,
|
|
|
|
GetFileReadHistograms(stats_, opts.io_activity),
|
|
|
|
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
|
2023-04-21 16:07:18 +00:00
|
|
|
true /*delay_enabled*/);
|
2022-07-06 18:42:59 +00:00
|
|
|
s = file_->ReadAsync(aligned_req, opts, read_async_callback,
|
|
|
|
read_async_info, io_handle, del_fn, nullptr /*dbg*/);
|
|
|
|
} else {
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
StopWatch sw(clock_, stats_, hist_type_,
|
|
|
|
GetFileReadHistograms(stats_, opts.io_activity),
|
|
|
|
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
|
2023-04-21 16:07:18 +00:00
|
|
|
true /*delay_enabled*/);
|
2022-07-06 18:42:59 +00:00
|
|
|
s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
|
|
|
|
io_handle, del_fn, nullptr /*dbg*/);
|
|
|
|
}
|
2022-11-14 05:38:35 +00:00
|
|
|
RecordTick(stats_, READ_ASYNC_MICROS, elapsed);
|
2022-07-06 18:42:59 +00:00
|
|
|
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
// Suppress false positive clang analyzer warnings.
|
|
|
|
// Memory is not released if file_->ReadAsync returns !s.ok(), because
|
|
|
|
// ReadAsyncCallback is never called in that case. If ReadAsyncCallback is
|
|
|
|
// called then ReadAsync should always return IOStatus::OK().
|
|
|
|
#ifndef __clang_analyzer__
|
2022-04-06 21:26:53 +00:00
|
|
|
if (!s.ok()) {
|
|
|
|
delete read_async_info;
|
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
#endif // __clang_analyzer__
|
|
|
|
|
2022-04-06 21:26:53 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req,
|
|
|
|
void* cb_arg) {
|
|
|
|
ReadAsyncInfo* read_async_info = static_cast<ReadAsyncInfo*>(cb_arg);
|
|
|
|
assert(read_async_info);
|
|
|
|
assert(read_async_info->cb_);
|
|
|
|
|
2022-07-06 18:42:59 +00:00
|
|
|
if (use_direct_io() && read_async_info->is_aligned_ == false) {
|
|
|
|
// Create FSReadRequest with user provided fields.
|
|
|
|
FSReadRequest user_req;
|
|
|
|
user_req.scratch = read_async_info->user_scratch_;
|
|
|
|
user_req.offset = read_async_info->user_offset_;
|
|
|
|
user_req.len = read_async_info->user_len_;
|
|
|
|
|
|
|
|
// Update results in user_req.
|
|
|
|
user_req.result = req.result;
|
|
|
|
user_req.status = req.status;
|
|
|
|
|
|
|
|
read_async_info->buf_.Size(read_async_info->buf_.CurrentSize() +
|
|
|
|
req.result.size());
|
|
|
|
|
|
|
|
size_t offset_advance_len = static_cast<size_t>(
|
|
|
|
/*offset_passed_by_user=*/read_async_info->user_offset_ -
|
|
|
|
/*aligned_offset=*/req.offset);
|
|
|
|
|
|
|
|
size_t res_len = 0;
|
|
|
|
if (req.status.ok() &&
|
|
|
|
offset_advance_len < read_async_info->buf_.CurrentSize()) {
|
|
|
|
res_len =
|
|
|
|
std::min(read_async_info->buf_.CurrentSize() - offset_advance_len,
|
|
|
|
read_async_info->user_len_);
|
|
|
|
if (read_async_info->user_aligned_buf_ == nullptr) {
|
|
|
|
// Copy the data into user's scratch.
|
|
|
|
// Clang analyzer assumes that it will take use_direct_io() == false in
|
|
|
|
// ReadAsync and use_direct_io() == true in Callback which cannot be true.
|
|
|
|
#ifndef __clang_analyzer__
|
|
|
|
read_async_info->buf_.Read(user_req.scratch, offset_advance_len,
|
|
|
|
res_len);
|
|
|
|
#endif // __clang_analyzer__
|
|
|
|
} else {
|
|
|
|
// Set aligned_buf provided by user without additional copy.
|
|
|
|
user_req.scratch =
|
|
|
|
read_async_info->buf_.BufferStart() + offset_advance_len;
|
|
|
|
read_async_info->user_aligned_buf_->reset(
|
|
|
|
read_async_info->buf_.Release());
|
|
|
|
}
|
|
|
|
user_req.result = Slice(user_req.scratch, res_len);
|
|
|
|
} else {
|
|
|
|
// Either req.status is not ok or data was not read.
|
|
|
|
user_req.result = Slice();
|
|
|
|
}
|
|
|
|
read_async_info->cb_(user_req, read_async_info->cb_arg_);
|
|
|
|
} else {
|
|
|
|
read_async_info->cb_(req, read_async_info->cb_arg_);
|
|
|
|
}
|
2022-04-06 21:26:53 +00:00
|
|
|
|
|
|
|
// Update stats and notify listeners.
|
|
|
|
if (stats_ != nullptr && file_read_hist_ != nullptr) {
|
|
|
|
// elapsed doesn't take into account delay and overwrite as StopWatch does
|
|
|
|
// in Read.
|
|
|
|
uint64_t elapsed = clock_->NowMicros() - read_async_info->start_time_;
|
|
|
|
file_read_hist_->Add(elapsed);
|
|
|
|
}
|
|
|
|
if (req.status.ok()) {
|
|
|
|
RecordInHistogram(stats_, ASYNC_READ_BYTES, req.result.size());
|
2022-11-14 05:38:35 +00:00
|
|
|
} else if (!req.status.IsAborted()) {
|
|
|
|
RecordTick(stats_, ASYNC_READ_ERROR_COUNT, 1);
|
2022-04-06 21:26:53 +00:00
|
|
|
}
|
|
|
|
if (ShouldNotifyListeners()) {
|
|
|
|
auto finish_ts = FileOperationInfo::FinishNow();
|
|
|
|
NotifyOnFileReadFinish(req.offset, req.result.size(),
|
|
|
|
read_async_info->fs_start_ts_, finish_ts,
|
|
|
|
req.status);
|
|
|
|
}
|
|
|
|
if (!req.status.ok()) {
|
|
|
|
NotifyOnIOError(req.status, FileOperationType::kRead, file_name(),
|
|
|
|
req.result.size(), req.offset);
|
|
|
|
}
|
|
|
|
RecordIOStats(stats_, file_temperature_, is_last_level_, req.result.size());
|
|
|
|
delete read_async_info;
|
2022-03-21 14:12:43 +00:00
|
|
|
}
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|