rocksdb/file/random_access_file_reader.cc

640 lines
24 KiB
C++
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "file/random_access_file_reader.h"
#include <algorithm>
#include <mutex>
#include "file/file_util.h"
#include "monitoring/histogram.h"
#include "monitoring/iostats_context_imp.h"
#include "port/port.h"
#include "table/format.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/rate_limiter_impl.h"
namespace ROCKSDB_NAMESPACE {
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
inline Histograms GetFileReadHistograms(Statistics* stats,
Env::IOActivity io_activity) {
switch (io_activity) {
case Env::IOActivity::kFlush:
return Histograms::FILE_READ_FLUSH_MICROS;
case Env::IOActivity::kCompaction:
return Histograms::FILE_READ_COMPACTION_MICROS;
case Env::IOActivity::kDBOpen:
return Histograms::FILE_READ_DB_OPEN_MICROS;
default:
break;
}
if (stats && stats->get_stats_level() > StatsLevel::kExceptDetailedTimers) {
switch (io_activity) {
case Env::IOActivity::kGet:
return Histograms::FILE_READ_GET_MICROS;
case Env::IOActivity::kMultiGet:
return Histograms::FILE_READ_MULTIGET_MICROS;
case Env::IOActivity::kDBIterator:
return Histograms::FILE_READ_DB_ITERATOR_MICROS;
case Env::IOActivity::kVerifyDBChecksum:
return Histograms::FILE_READ_VERIFY_DB_CHECKSUM_MICROS;
case Env::IOActivity::kVerifyFileChecksums:
return Histograms::FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS;
default:
break;
}
}
return Histograms::HISTOGRAM_ENUM_MAX;
}
inline void RecordIOStats(Statistics* stats, Temperature file_temperature,
bool is_last_level, size_t size) {
IOSTATS_ADD(bytes_read, size);
// record for last/non-last level
if (is_last_level) {
RecordTick(stats, LAST_LEVEL_READ_BYTES, size);
RecordTick(stats, LAST_LEVEL_READ_COUNT, 1);
} else {
RecordTick(stats, NON_LAST_LEVEL_READ_BYTES, size);
RecordTick(stats, NON_LAST_LEVEL_READ_COUNT, 1);
}
// record for temperature file
if (file_temperature != Temperature::kUnknown) {
switch (file_temperature) {
case Temperature::kHot:
IOSTATS_ADD(file_io_stats_by_temperature.hot_file_bytes_read, size);
IOSTATS_ADD(file_io_stats_by_temperature.hot_file_read_count, 1);
RecordTick(stats, HOT_FILE_READ_BYTES, size);
RecordTick(stats, HOT_FILE_READ_COUNT, 1);
break;
case Temperature::kWarm:
IOSTATS_ADD(file_io_stats_by_temperature.warm_file_bytes_read, size);
IOSTATS_ADD(file_io_stats_by_temperature.warm_file_read_count, 1);
RecordTick(stats, WARM_FILE_READ_BYTES, size);
RecordTick(stats, WARM_FILE_READ_COUNT, 1);
break;
case Temperature::kCold:
IOSTATS_ADD(file_io_stats_by_temperature.cold_file_bytes_read, size);
IOSTATS_ADD(file_io_stats_by_temperature.cold_file_read_count, 1);
RecordTick(stats, COLD_FILE_READ_BYTES, size);
RecordTick(stats, COLD_FILE_READ_COUNT, 1);
break;
default:
break;
}
}
}
IOStatus RandomAccessFileReader::Create(
const std::shared_ptr<FileSystem>& fs, const std::string& fname,
const FileOptions& file_opts,
std::unique_ptr<RandomAccessFileReader>* reader, IODebugContext* dbg) {
std::unique_ptr<FSRandomAccessFile> file;
IOStatus io_s = fs->NewRandomAccessFile(fname, file_opts, &file, dbg);
if (io_s.ok()) {
reader->reset(new RandomAccessFileReader(std::move(file), fname));
}
return io_s;
}
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
size_t n, Slice* result, char* scratch,
AlignedBuf* aligned_buf) const {
(void)aligned_buf;
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority;
TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr);
// To be paranoid: modify scratch a little bit, so in case underlying
// FileSystem doesn't fill the buffer but return success and `scratch` returns
// contains a previous block, returned value will not pass checksum.
if (n > 0 && scratch != nullptr) {
// This byte might not change anything for direct I/O case, but it's OK.
scratch[0]++;
}
IOStatus io_s;
uint64_t elapsed = 0;
Remove reallocation of AlignedBuffer in direct_io sync reads if already aligned (#11600) Summary: Remove reallocation of AlignedBuffer in direct_io sync reads in RandomAccessFileReader::Read if buffer passed is already aligned. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11600 Test Plan: Setup: `TEST_TMPDIR=./tmp-db/ ./db_bench -benchmarks=filluniquerandom -disable_auto_compactions=true -target_file_size_base=1048576 -write_buffer_size=1048576 -compression_type=none` Benchmark: `TEST_TMPDIR=./tmp-db/ perf record ./db_bench --cache_size=8388608 --use_existing_db=true --disable_auto_compactions=true --benchmarks=seekrandom --use_direct_reads=true -use_direct_io_for_flush_and_compaction=true -reads=1000 -seek_nexts=1 -max_auto_readahead_size=131072 -initial_auto_readahead_size=16384 -adaptive_readahead=true -num_file_reads_for_auto_readahead=0` Perf profile- Before: ``` 8.73% db_bench libc.so.6 [.] __memmove_evex_unaligned_erms 3.34% db_bench [kernel.vmlinux] [k] filemap_get_read_batch ``` After: ``` 2.50% db_bench [kernel.vmlinux] [k] filemap_get_read_batch 2.29% db_bench libc.so.6 [.] __memmove_evex_unaligned_erms ``` `make crash_test -j `with direct_io enabled completed succesfully locally. Ran few benchmarks with direct_io from seek_nexts varying between 912 to 327680 and different readahead_size parameters and it showed no regression so far. Reviewed By: ajkr Differential Revision: D47478598 Pulled By: akankshamahajan15 fbshipit-source-id: 6a48e21cb34696f5d09c22a6311a3a1cb5f9cf33
2023-07-15 03:08:05 +00:00
size_t alignment = file_->GetRequiredBufferAlignment();
bool is_aligned = false;
if (scratch != nullptr) {
// Check if offset, length and buffer are aligned.
is_aligned = (offset & (alignment - 1)) == 0 &&
(n & (alignment - 1)) == 0 &&
(uintptr_t(scratch) & (alignment - 1)) == 0;
}
{
StopWatch sw(clock_, stats_, hist_type_,
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
GetFileReadHistograms(stats_, opts.io_activity),
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
true /*delay_enabled*/);
auto prev_perf_level = GetPerfLevel();
IOSTATS_TIMER_GUARD(read_nanos);
Remove reallocation of AlignedBuffer in direct_io sync reads if already aligned (#11600) Summary: Remove reallocation of AlignedBuffer in direct_io sync reads in RandomAccessFileReader::Read if buffer passed is already aligned. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11600 Test Plan: Setup: `TEST_TMPDIR=./tmp-db/ ./db_bench -benchmarks=filluniquerandom -disable_auto_compactions=true -target_file_size_base=1048576 -write_buffer_size=1048576 -compression_type=none` Benchmark: `TEST_TMPDIR=./tmp-db/ perf record ./db_bench --cache_size=8388608 --use_existing_db=true --disable_auto_compactions=true --benchmarks=seekrandom --use_direct_reads=true -use_direct_io_for_flush_and_compaction=true -reads=1000 -seek_nexts=1 -max_auto_readahead_size=131072 -initial_auto_readahead_size=16384 -adaptive_readahead=true -num_file_reads_for_auto_readahead=0` Perf profile- Before: ``` 8.73% db_bench libc.so.6 [.] __memmove_evex_unaligned_erms 3.34% db_bench [kernel.vmlinux] [k] filemap_get_read_batch ``` After: ``` 2.50% db_bench [kernel.vmlinux] [k] filemap_get_read_batch 2.29% db_bench libc.so.6 [.] __memmove_evex_unaligned_erms ``` `make crash_test -j `with direct_io enabled completed succesfully locally. Ran few benchmarks with direct_io from seek_nexts varying between 912 to 327680 and different readahead_size parameters and it showed no regression so far. Reviewed By: ajkr Differential Revision: D47478598 Pulled By: akankshamahajan15 fbshipit-source-id: 6a48e21cb34696f5d09c22a6311a3a1cb5f9cf33
2023-07-15 03:08:05 +00:00
if (use_direct_io() && is_aligned == false) {
size_t aligned_offset =
TruncateToPageBoundary(alignment, static_cast<size_t>(offset));
size_t offset_advance = static_cast<size_t>(offset) - aligned_offset;
size_t read_size =
Roundup(static_cast<size_t>(offset + n), alignment) - aligned_offset;
AlignedBuffer buf;
buf.Alignment(alignment);
buf.AllocateNewBuffer(read_size);
while (buf.CurrentSize() < read_size) {
size_t allowed;
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
2022-02-17 07:17:03 +00:00
if (rate_limiter_priority != Env::IO_TOTAL &&
rate_limiter_ != nullptr) {
allowed = rate_limiter_->RequestToken(
buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
2022-02-17 07:17:03 +00:00
rate_limiter_priority, stats_, RateLimiter::OpType::kRead);
} else {
assert(buf.CurrentSize() == 0);
allowed = read_size;
}
Slice tmp;
FileOperationInfo::StartTimePoint start_ts;
uint64_t orig_offset = 0;
if (ShouldNotifyListeners()) {
start_ts = FileOperationInfo::StartNow();
orig_offset = aligned_offset + buf.CurrentSize();
}
{
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
// Only user reads are expected to specify a timeout. And user reads
// are not subjected to rate_limiter and should go through only
// one iteration of this loop, so we don't need to check and adjust
// the opts.timeout before calling file_->Read
assert(!opts.timeout.count() || allowed == read_size);
io_s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts,
&tmp, buf.Destination(), nullptr);
}
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts,
io_s);
if (!io_s.ok()) {
NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
tmp.size(), orig_offset);
}
}
buf.Size(buf.CurrentSize() + tmp.size());
if (!io_s.ok() || tmp.size() < allowed) {
break;
}
}
size_t res_len = 0;
if (io_s.ok() && offset_advance < buf.CurrentSize()) {
res_len = std::min(buf.CurrentSize() - offset_advance, n);
if (aligned_buf == nullptr) {
buf.Read(scratch, offset_advance, res_len);
} else {
scratch = buf.BufferStart() + offset_advance;
aligned_buf->reset(buf.Release());
}
}
*result = Slice(scratch, res_len);
} else {
size_t pos = 0;
const char* res_scratch = nullptr;
while (pos < n) {
size_t allowed;
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
2022-02-17 07:17:03 +00:00
if (rate_limiter_priority != Env::IO_TOTAL &&
rate_limiter_ != nullptr) {
if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
sw.DelayStart();
}
Remove reallocation of AlignedBuffer in direct_io sync reads if already aligned (#11600) Summary: Remove reallocation of AlignedBuffer in direct_io sync reads in RandomAccessFileReader::Read if buffer passed is already aligned. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11600 Test Plan: Setup: `TEST_TMPDIR=./tmp-db/ ./db_bench -benchmarks=filluniquerandom -disable_auto_compactions=true -target_file_size_base=1048576 -write_buffer_size=1048576 -compression_type=none` Benchmark: `TEST_TMPDIR=./tmp-db/ perf record ./db_bench --cache_size=8388608 --use_existing_db=true --disable_auto_compactions=true --benchmarks=seekrandom --use_direct_reads=true -use_direct_io_for_flush_and_compaction=true -reads=1000 -seek_nexts=1 -max_auto_readahead_size=131072 -initial_auto_readahead_size=16384 -adaptive_readahead=true -num_file_reads_for_auto_readahead=0` Perf profile- Before: ``` 8.73% db_bench libc.so.6 [.] __memmove_evex_unaligned_erms 3.34% db_bench [kernel.vmlinux] [k] filemap_get_read_batch ``` After: ``` 2.50% db_bench [kernel.vmlinux] [k] filemap_get_read_batch 2.29% db_bench libc.so.6 [.] __memmove_evex_unaligned_erms ``` `make crash_test -j `with direct_io enabled completed succesfully locally. Ran few benchmarks with direct_io from seek_nexts varying between 912 to 327680 and different readahead_size parameters and it showed no regression so far. Reviewed By: ajkr Differential Revision: D47478598 Pulled By: akankshamahajan15 fbshipit-source-id: 6a48e21cb34696f5d09c22a6311a3a1cb5f9cf33
2023-07-15 03:08:05 +00:00
allowed = rate_limiter_->RequestToken(
n - pos, (use_direct_io() ? alignment : 0), rate_limiter_priority,
stats_, RateLimiter::OpType::kRead);
if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
sw.DelayStop();
}
} else {
allowed = n;
}
Slice tmp_result;
FileOperationInfo::StartTimePoint start_ts;
if (ShouldNotifyListeners()) {
start_ts = FileOperationInfo::StartNow();
}
{
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
// Only user reads are expected to specify a timeout. And user reads
// are not subjected to rate_limiter and should go through only
// one iteration of this loop, so we don't need to check and adjust
// the opts.timeout before calling file_->Read
assert(!opts.timeout.count() || allowed == n);
io_s = file_->Read(offset + pos, allowed, opts, &tmp_result,
scratch + pos, nullptr);
}
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts,
finish_ts, io_s);
if (!io_s.ok()) {
NotifyOnIOError(io_s, FileOperationType::kRead, file_name(),
tmp_result.size(), offset + pos);
}
}
if (res_scratch == nullptr) {
// we can't simply use `scratch` because reads of mmap'd files return
// data in a different buffer.
res_scratch = tmp_result.data();
} else {
// make sure chunks are inserted contiguously into `res_scratch`.
assert(tmp_result.data() == res_scratch + pos);
}
pos += tmp_result.size();
if (!io_s.ok() || tmp_result.size() < allowed) {
break;
}
}
*result = Slice(res_scratch, io_s.ok() ? pos : 0);
}
RecordIOStats(stats_, file_temperature_, is_last_level_, result->size());
SetPerfLevel(prev_perf_level);
}
if (stats_ != nullptr && file_read_hist_ != nullptr) {
file_read_hist_->Add(elapsed);
}
#ifndef NDEBUG
auto pair = std::make_pair(&file_name_, &io_s);
if (offset == 0) {
TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read::BeforeReturn",
&pair);
}
TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read::AnyOffset", &pair);
#endif
return io_s;
}
size_t End(const FSReadRequest& r) {
return static_cast<size_t>(r.offset) + r.len;
}
FSReadRequest Align(const FSReadRequest& r, size_t alignment) {
FSReadRequest req;
req.offset = static_cast<uint64_t>(
TruncateToPageBoundary(alignment, static_cast<size_t>(r.offset)));
req.len = Roundup(End(r), alignment) - req.offset;
req.scratch = nullptr;
return req;
}
bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) {
size_t dest_offset = static_cast<size_t>(dest->offset);
size_t src_offset = static_cast<size_t>(src.offset);
size_t dest_end = End(*dest);
size_t src_end = End(src);
if (std::max(dest_offset, src_offset) > std::min(dest_end, src_end)) {
return false;
}
dest->offset = static_cast<uint64_t>(std::min(dest_offset, src_offset));
dest->len = std::max(dest_end, src_end) - dest->offset;
return true;
}
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts,
FSReadRequest* read_reqs,
size_t num_reqs,
AlignedBuf* aligned_buf) const {
(void)aligned_buf; // suppress warning of unused variable in LITE mode
assert(num_reqs > 0);
#ifndef NDEBUG
for (size_t i = 0; i < num_reqs - 1; ++i) {
assert(read_reqs[i].offset <= read_reqs[i + 1].offset);
}
#endif // !NDEBUG
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority;
// To be paranoid modify scratch a little bit, so in case underlying
Support read rate-limiting in SequentialFileReader (#9973) Summary: Added rate limiter and read rate-limiting support to SequentialFileReader. I've updated call sites to SequentialFileReader::Read with appropriate IO priority (or left a TODO and specified IO_TOTAL for now). The PR is separated into four commits: the first one added the rate-limiting support, but with some fixes in the unit test since the number of request bytes from rate limiter in SequentialFileReader are not accurate (there is overcharge at EOF). The second commit fixed this by allowing SequentialFileReader to check file size and determine how many bytes are left in the file to read. The third commit added benchmark related code. The fourth commit moved the logic of using file size to avoid overcharging the rate limiter into backup engine (the main user of SequentialFileReader). Pull Request resolved: https://github.com/facebook/rocksdb/pull/9973 Test Plan: - `make check`, backup_engine_test covers usage of SequentialFileReader with rate limiter. - Run db_bench to check if rate limiting is throttling as expected: Verified that reads and writes are together throttled at 2MB/s, and at 0.2MB chunks that are 100ms apart. - Set up: `./db_bench --benchmarks=fillrandom -db=/dev/shm/test_rocksdb` - Benchmark: ``` strace -ttfe read,write ./db_bench --benchmarks=backup -db=/dev/shm/test_rocksdb --backup_rate_limit=2097152 --use_existing_db strace -ttfe read,write ./db_bench --benchmarks=restore -db=/dev/shm/test_rocksdb --restore_rate_limit=2097152 --use_existing_db ``` - db bench on backup and restore to ensure no performance regression. - backup (avg over 50 runs): pre-change: 1.90443e+06 micros/op; post-change: 1.8993e+06 micros/op (improve by 0.2%) - restore (avg over 50 runs): pre-change: 1.79105e+06 micros/op; post-change: 1.78192e+06 micros/op (improve by 0.5%) ``` # Set up ./db_bench --benchmarks=fillrandom -db=/tmp/test_rocksdb -num=10000000 # benchmark TEST_TMPDIR=/tmp/test_rocksdb NUM_RUN=50 for ((j=0;j<$NUM_RUN;j++)) do ./db_bench -db=$TEST_TMPDIR -num=10000000 -benchmarks=backup -use_existing_db | egrep 'backup' # Restore #./db_bench -db=$TEST_TMPDIR -num=10000000 -benchmarks=restore -use_existing_db done > rate_limit.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' rate_limit.txt >> rate_limit_2.txt ``` Reviewed By: hx235 Differential Revision: D36327418 Pulled By: cbi42 fbshipit-source-id: e75d4307cff815945482df5ba630c1e88d064691
2022-05-24 17:28:57 +00:00
// FileSystem doesn't fill the buffer but return success and `scratch` returns
// contains a previous block, returned value will not pass checksum.
// This byte might not change anything for direct I/O case, but it's OK.
for (size_t i = 0; i < num_reqs; i++) {
FSReadRequest& r = read_reqs[i];
if (r.len > 0 && r.scratch != nullptr) {
r.scratch[0]++;
}
}
IOStatus io_s;
uint64_t elapsed = 0;
{
StopWatch sw(clock_, stats_, hist_type_,
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
GetFileReadHistograms(stats_, opts.io_activity),
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
true /*delay_enabled*/);
auto prev_perf_level = GetPerfLevel();
IOSTATS_TIMER_GUARD(read_nanos);
FSReadRequest* fs_reqs = read_reqs;
size_t num_fs_reqs = num_reqs;
std::vector<FSReadRequest> aligned_reqs;
if (use_direct_io()) {
// num_reqs is the max possible size,
// this can reduce std::vecector's internal resize operations.
aligned_reqs.reserve(num_reqs);
// Align and merge the read requests.
size_t alignment = file_->GetRequiredBufferAlignment();
for (size_t i = 0; i < num_reqs; i++) {
FSReadRequest r = Align(read_reqs[i], alignment);
if (i == 0) {
// head
aligned_reqs.push_back(std::move(r));
} else if (!TryMerge(&aligned_reqs.back(), r)) {
// head + n
aligned_reqs.push_back(std::move(r));
} else {
// unused
r.status.PermitUncheckedError();
}
}
TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::MultiRead:AlignedReqs",
&aligned_reqs);
// Allocate aligned buffer and let scratch buffers point to it.
size_t total_len = 0;
for (const auto& r : aligned_reqs) {
total_len += r.len;
}
AlignedBuffer buf;
buf.Alignment(alignment);
buf.AllocateNewBuffer(total_len);
char* scratch = buf.BufferStart();
for (auto& r : aligned_reqs) {
r.scratch = scratch;
scratch += r.len;
}
aligned_buf->reset(buf.Release());
fs_reqs = aligned_reqs.data();
num_fs_reqs = aligned_reqs.size();
}
FileOperationInfo::StartTimePoint start_ts;
if (ShouldNotifyListeners()) {
start_ts = FileOperationInfo::StartNow();
}
{
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_);
Add rate-limiting support to batched MultiGet() (#10159) Summary: **Context/Summary:** https://github.com/facebook/rocksdb/pull/9424 added rate-limiting support for user reads, which does not include batched `MultiGet()`s that call `RandomAccessFileReader::MultiRead()`. The reason is that it's harder (compared with RandomAccessFileReader::Read()) to implement the ideal rate-limiting where we first call `RateLimiter::RequestToken()` for allowed bytes to multi-read and then consume those bytes by satisfying as many requests in `MultiRead()` as possible. For example, it can be tricky to decide whether we want partially fulfilled requests within one `MultiRead()` or not. However, due to a recent urgent user request, we decide to pursue an elementary (but a conditionally ineffective) solution where we accumulate enough rate limiter requests toward the total bytes needed by one `MultiRead()` before doing that `MultiRead()`. This is not ideal when the total bytes are huge as we will actually consume a huge bandwidth from rate-limiter causing a burst on disk. This is not what we ultimately want with rate limiter. Therefore a follow-up work is noted through TODO comments. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10159 Test Plan: - Modified existing unit test `DBRateLimiterOnReadTest/DBRateLimiterOnReadTest.NewMultiGet` - Traced the underlying system calls `io_uring_enter` and verified they are 10 seconds apart from each other correctly under the setting of `strace -ftt -e trace=io_uring_enter ./db_bench -benchmarks=multireadrandom -db=/dev/shm/testdb2 -readonly -num=50 -threads=1 -multiread_batched=1 -batch_size=100 -duration=10 -rate_limiter_bytes_per_sec=200 -rate_limiter_refill_period_us=1000000 -rate_limit_bg_reads=1 -disable_auto_compactions=1 -rate_limit_user_ops=1` where each `MultiRead()` read about 2000 bytes (inspected by debugger) and the rate limiter grants 200 bytes per seconds. - Stress test: - Verified `./db_stress (-test_cf_consistency=1/test_batches_snapshots=1) -use_multiget=1 -cache_size=1048576 -rate_limiter_bytes_per_sec=10241024 -rate_limit_bg_reads=1 -rate_limit_user_ops=1` work Reviewed By: ajkr, anand1976 Differential Revision: D37135172 Pulled By: hx235 fbshipit-source-id: 73b8e8f14761e5d4b77235dfe5d41f4eea968bcd
2022-06-17 23:40:47 +00:00
if (rate_limiter_priority != Env::IO_TOTAL && rate_limiter_ != nullptr) {
// TODO: ideally we should call `RateLimiter::RequestToken()` for
// allowed bytes to multi-read and then consume those bytes by
// satisfying as many requests in `MultiRead()` as possible, instead of
// what we do here, which can cause burst when the
// `total_multi_read_size` is big.
size_t total_multi_read_size = 0;
assert(fs_reqs != nullptr);
for (size_t i = 0; i < num_fs_reqs; ++i) {
FSReadRequest& req = fs_reqs[i];
total_multi_read_size += req.len;
}
size_t remaining_bytes = total_multi_read_size;
size_t request_bytes = 0;
while (remaining_bytes > 0) {
request_bytes = std::min(
static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()),
remaining_bytes);
rate_limiter_->Request(request_bytes, rate_limiter_priority,
nullptr /* stats */,
RateLimiter::OpType::kRead);
remaining_bytes -= request_bytes;
}
}
io_s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr);
RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs);
}
if (use_direct_io()) {
// Populate results in the unaligned read requests.
size_t aligned_i = 0;
for (size_t i = 0; i < num_reqs; i++) {
auto& r = read_reqs[i];
if (static_cast<size_t>(r.offset) > End(aligned_reqs[aligned_i])) {
aligned_i++;
}
const auto& fs_r = fs_reqs[aligned_i];
r.status = fs_r.status;
if (r.status.ok()) {
uint64_t offset = r.offset - fs_r.offset;
if (fs_r.result.size() <= offset) {
// No byte in the read range is returned.
r.result = Slice();
} else {
size_t len = std::min(
r.len, static_cast<size_t>(fs_r.result.size() - offset));
r.result = Slice(fs_r.scratch + offset, len);
}
} else {
r.result = Slice();
}
}
}
for (size_t i = 0; i < num_reqs; ++i) {
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(),
start_ts, finish_ts, read_reqs[i].status);
}
if (!read_reqs[i].status.ok()) {
NotifyOnIOError(read_reqs[i].status, FileOperationType::kRead,
file_name(), read_reqs[i].result.size(),
read_reqs[i].offset);
}
RecordIOStats(stats_, file_temperature_, is_last_level_,
read_reqs[i].result.size());
}
SetPerfLevel(prev_perf_level);
}
if (stats_ != nullptr && file_read_hist_ != nullptr) {
file_read_hist_->Add(elapsed);
}
return io_s;
}
IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
IOOptions& opts) const {
if (clock_ != nullptr) {
return PrepareIOFromReadOptions(ro, clock_, opts);
} else {
return PrepareIOFromReadOptions(ro, SystemClock::Default().get(), opts);
}
}
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
IOStatus RandomAccessFileReader::ReadAsync(
FSReadRequest& req, const IOOptions& opts,
std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
void** io_handle, IOHandleDeleter* del_fn, AlignedBuf* aligned_buf) {
IOStatus s;
// Create a callback and populate info.
auto read_async_callback =
std::bind(&RandomAccessFileReader::ReadAsyncCallback, this,
std::placeholders::_1, std::placeholders::_2);
Add AsyncIO support for tuning readahead_size by block cache lookup (#11936) Summary: Add support for tuning of readahead_size by block cache lookup for async_io. **Design/ Implementation** - **BlockBasedTableIterator.cc** - `BlockCacheLookupForReadAheadSize` callback API lookups in the block cache and tries to reduce the start and end offset passed. This function looks into the block cache for the blocks between `start_offset` and `end_offset` and add all the handles in the queue. It then iterates from the end in the handles to find first miss block and update the end offset to that block. It also iterates from the start and find first miss block and update the start offset to that block. ``` _read_curr_block_ argument : True if this call was due to miss in the cache and caller wants to read that block synchronously. False if current call is to prefetch additional data in extra buffers (due to ReadAsync call in FilePrefetchBuffer) ``` In case there is no data to be read in that callback (because of upper_bound or all blocks are in cache), it updates start and end offset to be equal and that `FilePrefetchBuffer` interprets that as 0 length to be read. **FilePrefetchBuffer.cc** - FilePrefetchBuffer calls the callback - `ReadAheadSizeTuning` and pass the start and end offset to that callback to get updated start and end offset to read based on cache hits/misses. 1. In case of Read calls (when offset passed to FilePrefetchBuffer is on cache miss and that data needs to be read), _read_curr_block_ is passed true. 2. In case of ReadAsync calls, when buffer is all consumed and can go for additional prefetching, the start offset passed is the initial end offset of prev buffer (without any updated offset based on cache hit/miss). Foreg. if following are the data blocks with cache hit/miss and start offset and Read API found miss on DB1 and based on readahead_size (50) it passes end offset to be 50. [DB1 - miss- 0 ] [DB2 - hit -10] [DB3 - miss -20] [DB4 - miss-30] [DB5 - hit-40] [DB6 - hit-50] [DB7 - miss-60] [DB8 - miss - 70] [DB9 - hit - 80] [DB6 - hit 90] - For Read call - updated start offset remains 0 but end offset updates to DB4, as DB5 is in cache. - Read calls saves initial end offset 50 as that was meant to be prefetched. - Now for next ReadAsync call - the start offset will be 50 (previous buffer initial end offset) and based on readahead_size, end offset will be 100 - On callback, because of cache hits - callback will update the start offset to 60 and end offset to 80 to read only 2 data blocks (DB7 and DB8). - And for that ReadAsync call - initial end offset will be set to 100 which will again used by next ReadAsync call as start offset. - `initial_end_offset_` in `BufferInfo` is used to save the initial end offset of that buffer. - If let's say DB5 and DB6 overlaps in 2 buffers (because of alignment), `prev_buf_end_offset` is passed to make sure already prefetched data is not prefetched again in second buffer. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11936 Test Plan: - Ran crash_test several times. - New unit tests added. Reviewed By: anand1976 Differential Revision: D50906217 Pulled By: akankshamahajan15 fbshipit-source-id: 0d75d3c98274e98aa34901b201b8fb05232139cf
2023-12-06 21:48:15 +00:00
ReadAsyncInfo* read_async_info = new ReadAsyncInfo(
cb, cb_arg, (clock_ != nullptr ? clock_->NowMicros() : 0));
if (ShouldNotifyListeners()) {
read_async_info->fs_start_ts_ = FileOperationInfo::StartNow();
}
size_t alignment = file_->GetRequiredBufferAlignment();
bool is_aligned = (req.offset & (alignment - 1)) == 0 &&
(req.len & (alignment - 1)) == 0 &&
(uintptr_t(req.scratch) & (alignment - 1)) == 0;
read_async_info->is_aligned_ = is_aligned;
uint64_t elapsed = 0;
if (use_direct_io() && is_aligned == false) {
FSReadRequest aligned_req = Align(req, alignment);
Async optimization in scan path (#10602) Summary: Optimizations 1. In FilePrefetchBuffer, when data is overlapping between two buffers, it copies the data from first to third buffer, then from second to third buffer to return continuous buffer. This optimization will call ReadAsync on first buffer as soon as buffer is empty instead of getting blocked by second buffer to copy the data. 2. For fixed size readahead_size, FilePrefetchBuffer will issues two async read calls. One with length + readahead_size_/2 on first buffer(if buffer is empty) and readahead_size_/2 on second buffer during seek. - Add readahead_size to db_stress for stress testing these changes in https://github.com/facebook/rocksdb/pull/10632 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10602 Test Plan: - CircleCI tests - stress_test completed successfully export CRASH_TEST_EXT_ARGS="--async_io=1" make crash_test -j32 - db_bench showed no regression With this PR: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876074584472 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:14:34 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 270878.018 micros/op 3 ops/sec 30.068 seconds 111 operations; 618.7 MB/s (111 of 111 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=true -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661875332862922 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:02:12 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 WARNING: Assertions are enabled; benchmarks unnecessarily slow ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 358352.488 micros/op 2 ops/sec 30.102 seconds 84 operations; 474.4 MB/s (84 of 84 found) ``` Without PR in main: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=1 Set seed to 1661876425983045 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:20:26 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 280881.953 micros/op 3 ops/sec 30.054 seconds 107 operations; 605.2 MB/s (107 of 107 found) ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main1 -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=50000000 -use_direct_reads=false -seek_nexts=327680 -duration=30 -ops_between_duration_checks=1 -async_io=0 Set seed to 1661876475267771 because --seed was 0 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags Integrated BlobDB: blob cache disabled RocksDB: version 7.7.0 Date: Tue Aug 30 09:21:15 2022 CPU: 32 * Intel Xeon Processor (Skylake) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 50000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 25939.9 MB (estimated) FileSize: 13732.9 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main1] seekrandom : 363155.084 micros/op 2 ops/sec 30.142 seconds 83 operations; 468.1 MB/s (83 of 83 found) ``` Reviewed By: anand1976 Differential Revision: D39141328 Pulled By: akankshamahajan15 fbshipit-source-id: 560655922c1a437a8569c228abb31b8c0b413120
2022-09-13 00:42:01 +00:00
aligned_req.status.PermitUncheckedError();
// Allocate aligned buffer.
read_async_info->buf_.Alignment(alignment);
read_async_info->buf_.AllocateNewBuffer(aligned_req.len);
// Set rem fields in aligned FSReadRequest.
aligned_req.scratch = read_async_info->buf_.BufferStart();
// Set user provided fields to populate back in callback.
read_async_info->user_scratch_ = req.scratch;
read_async_info->user_aligned_buf_ = aligned_buf;
read_async_info->user_len_ = req.len;
read_async_info->user_offset_ = req.offset;
read_async_info->user_result_ = req.result;
assert(read_async_info->buf_.CurrentSize() == 0);
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
StopWatch sw(clock_, stats_, hist_type_,
GetFileReadHistograms(stats_, opts.io_activity),
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
true /*delay_enabled*/);
s = file_->ReadAsync(aligned_req, opts, read_async_callback,
read_async_info, io_handle, del_fn, nullptr /*dbg*/);
} else {
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
StopWatch sw(clock_, stats_, hist_type_,
GetFileReadHistograms(stats_, opts.io_activity),
(stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/,
Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) Summary: **Context:** The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them. **Summary** - Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros` - Fixed the default histogram in `RandomAccessFileReader` - New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader` - Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288 Test Plan: - **Stress test** - **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.** (without blob) - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads. ``` ./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10) ``` ``` // BlockBasedTable rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805 rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116 rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689 // PlainTable Does not apply ``` - **Db bench 2: performance** **Read** SETUP: db with 900 files ``` ./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none ```run till convergence ``` ./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3 ``` Pre-change `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec` Post-change (no regression, -0.3%) `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec` **Compaction/Flush**run till convergence ``` ./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none rocksdb.sst.read.micros COUNT : 33820 rocksdb.sst.read.flush.micros COUNT : 1800 rocksdb.sst.read.compaction.micros COUNT : 32020 ``` Pre-change `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec` Post-change (no regression, ~-0.4%) `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec` Reviewed By: ajkr Differential Revision: D44007011 Pulled By: hx235 fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 16:07:18 +00:00
true /*delay_enabled*/);
s = file_->ReadAsync(req, opts, read_async_callback, read_async_info,
io_handle, del_fn, nullptr /*dbg*/);
}
RecordTick(stats_, READ_ASYNC_MICROS, elapsed);
Provide support for IOTracing for ReadAsync API (#9833) Summary: Same as title Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833 Test Plan: Add unit test and manually check the output of tracing logs For fixed readahead_size it logs as: ``` Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456 Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744 Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032 Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320 Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608 Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896 Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184 Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472 Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760 Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349 ``` For implicit readahead: ``` Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174 Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248 Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322 Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408 ``` Reviewed By: anand1976 Differential Revision: D35601634 Pulled By: akankshamahajan15 fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
// Suppress false positive clang analyzer warnings.
// Memory is not released if file_->ReadAsync returns !s.ok(), because
// ReadAsyncCallback is never called in that case. If ReadAsyncCallback is
// called then ReadAsync should always return IOStatus::OK().
#ifndef __clang_analyzer__
if (!s.ok()) {
delete read_async_info;
}
Provide support for IOTracing for ReadAsync API (#9833) Summary: Same as title Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833 Test Plan: Add unit test and manually check the output of tracing logs For fixed readahead_size it logs as: ``` Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456 Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744 Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032 Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320 Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608 Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896 Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184 Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472 Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760 Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349 ``` For implicit readahead: ``` Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174 Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248 Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322 Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408 ``` Reviewed By: anand1976 Differential Revision: D35601634 Pulled By: akankshamahajan15 fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
#endif // __clang_analyzer__
return s;
}
void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req,
void* cb_arg) {
ReadAsyncInfo* read_async_info = static_cast<ReadAsyncInfo*>(cb_arg);
assert(read_async_info);
assert(read_async_info->cb_);
if (use_direct_io() && read_async_info->is_aligned_ == false) {
// Create FSReadRequest with user provided fields.
FSReadRequest user_req;
user_req.scratch = read_async_info->user_scratch_;
user_req.offset = read_async_info->user_offset_;
user_req.len = read_async_info->user_len_;
// Update results in user_req.
user_req.result = req.result;
user_req.status = req.status;
read_async_info->buf_.Size(read_async_info->buf_.CurrentSize() +
req.result.size());
size_t offset_advance_len = static_cast<size_t>(
/*offset_passed_by_user=*/read_async_info->user_offset_ -
/*aligned_offset=*/req.offset);
size_t res_len = 0;
if (req.status.ok() &&
offset_advance_len < read_async_info->buf_.CurrentSize()) {
res_len =
std::min(read_async_info->buf_.CurrentSize() - offset_advance_len,
read_async_info->user_len_);
if (read_async_info->user_aligned_buf_ == nullptr) {
// Copy the data into user's scratch.
// Clang analyzer assumes that it will take use_direct_io() == false in
// ReadAsync and use_direct_io() == true in Callback which cannot be true.
#ifndef __clang_analyzer__
read_async_info->buf_.Read(user_req.scratch, offset_advance_len,
res_len);
#endif // __clang_analyzer__
} else {
// Set aligned_buf provided by user without additional copy.
user_req.scratch =
read_async_info->buf_.BufferStart() + offset_advance_len;
read_async_info->user_aligned_buf_->reset(
read_async_info->buf_.Release());
}
user_req.result = Slice(user_req.scratch, res_len);
} else {
// Either req.status is not ok or data was not read.
user_req.result = Slice();
}
read_async_info->cb_(user_req, read_async_info->cb_arg_);
} else {
read_async_info->cb_(req, read_async_info->cb_arg_);
}
// Update stats and notify listeners.
if (stats_ != nullptr && file_read_hist_ != nullptr) {
// elapsed doesn't take into account delay and overwrite as StopWatch does
// in Read.
uint64_t elapsed = clock_->NowMicros() - read_async_info->start_time_;
file_read_hist_->Add(elapsed);
}
if (req.status.ok()) {
RecordInHistogram(stats_, ASYNC_READ_BYTES, req.result.size());
} else if (!req.status.IsAborted()) {
RecordTick(stats_, ASYNC_READ_ERROR_COUNT, 1);
}
if (ShouldNotifyListeners()) {
auto finish_ts = FileOperationInfo::FinishNow();
NotifyOnFileReadFinish(req.offset, req.result.size(),
read_async_info->fs_start_ts_, finish_ts,
req.status);
}
if (!req.status.ok()) {
NotifyOnIOError(req.status, FileOperationType::kRead, file_name(),
req.result.size(), req.offset);
}
RecordIOStats(stats_, file_temperature_, is_last_level_, req.result.size());
delete read_async_info;
Provide implementation to prefetch data asynchronously in FilePrefetchBuffer (#9674) Summary: In FilePrefetchBuffer if reads are sequential, after prefetching call ReadAsync API to prefetch data asynchronously so that in next prefetching data will be available. Data prefetched asynchronously will be readahead_size/2. It uses two buffers, one for synchronous prefetching and one for asynchronous. In case, the data is overlapping, the data is copied from both buffers to third buffer to make it continuous. This feature is under ReadOptions::async_io and is under experimental. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9674 Test Plan: 1. Add new unit tests 2. Run **db_stress** to make sure nothing crashes. - Normal prefetch without `async_io` ran successfully: ``` export CRASH_TEST_EXT_ARGS=" --async_io=0" make crash_test -j ``` 3. **Run Regressions**. i) Main branch without any change for normal prefetching with async_io disabled: ``` ./db_bench -db=/tmp/prefix_scan_prefetch_main -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000000 - use_direct_io_for_flush_and_compaction=true -target_file_size_base=16777216 ``` ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_main -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 13:11:34 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_main] seekrandom : 483618.390 micros/op 2 ops/sec; 338.9 MB/s (249 of 249 found) ``` ii) normal prefetching after changes with async_io disable: ``` ./db_bench -use_existing_db=true -db=/tmp/prefix_scan_prefetch_withchange -benchmarks="seekrandom" -key_size=32 -value_size=512 -num=5000000 -use_direct_reads=true -seek_nexts=327680 -duration=120 -ops_between_duration_checks=1 Initializing RocksDB Options from the specified file Initializing RocksDB Options from command-line flags RocksDB: version 7.0 Date: Thu Mar 17 14:11:31 2022 CPU: 24 * Intel Core Processor (Broadwell) CPUCache: 16384 KB Keys: 32 bytes each (+ 0 bytes user-defined timestamp) Values: 512 bytes each (256 bytes after compression) Entries: 5000000 Prefix: 0 bytes Keys per prefix: 0 RawSize: 2594.0 MB (estimated) FileSize: 1373.3 MB (estimated) Write rate: 0 bytes/second Read rate: 0 ops/second Compression: Snappy Compression sampling rate: 0 Memtablerep: SkipListFactory Perf Level: 1 ------------------------------------------------ DB path: [/tmp/prefix_scan_prefetch_withchange] seekrandom : 471347.227 micros/op 2 ops/sec; 348.1 MB/s (255 of 255 found) ``` Reviewed By: anand1976 Differential Revision: D34731543 Pulled By: akankshamahajan15 fbshipit-source-id: 8e23aa93453d5fe3c672b9231ad582f60207937f
2022-03-21 14:12:43 +00:00
}
} // namespace ROCKSDB_NAMESPACE