rocksdb/db/blob/blob_log_sequential_reader.cc

134 lines
4 KiB
C++
Raw Normal View History

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#include "db/blob/blob_log_sequential_reader.h"
#include "file/random_access_file_reader.h"
#include "monitoring/statistics_impl.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {
BlobLogSequentialReader::BlobLogSequentialReader(
std::unique_ptr<RandomAccessFileReader>&& file_reader, SystemClock* clock,
Statistics* statistics)
: file_(std::move(file_reader)),
clock_(clock),
statistics_(statistics),
next_byte_(0) {}
BlobLogSequentialReader::~BlobLogSequentialReader() = default;
Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice,
char* buf) {
assert(slice);
assert(file_);
StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
Add rate limiter priority to ReadOptions (#9424) Summary: Users can set the priority for file reads associated with their operation by setting `ReadOptions::rate_limiter_priority` to something other than `Env::IO_TOTAL`. Rate limiting `VerifyChecksum()` and `VerifyFileChecksums()` is the motivation for this PR, so it also includes benchmarks and minor bug fixes to get that working. `RandomAccessFileReader::Read()` already had support for rate limiting compaction reads. I changed that rate limiting to be non-specific to compaction, but rather performed according to the passed in `Env::IOPriority`. Now the compaction read rate limiting is supported by setting `rate_limiter_priority = Env::IO_LOW` on its `ReadOptions`. There is no default value for the new `Env::IOPriority` parameter to `RandomAccessFileReader::Read()`. That means this PR goes through all callers (in some cases multiple layers up the call stack) to find a `ReadOptions` to provide the priority. There are TODOs for cases I believe it would be good to let user control the priority some day (e.g., file footer reads), and no TODO in cases I believe it doesn't matter (e.g., trace file reads). The API doc only lists the missing cases where a file read associated with a provided `ReadOptions` cannot be rate limited. For cases like file ingestion checksum calculation, there is no API to provide `ReadOptions` or `Env::IOPriority`, so I didn't count that as missing. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9424 Test Plan: - new unit tests - new benchmarks on ~50MB database with 1MB/s read rate limit and 100ms refill interval; verified with strace reads are chunked (at 0.1MB per chunk) and spaced roughly 100ms apart. - setup command: `./db_bench -benchmarks=fillrandom,compact -db=/tmp/testdb -target_file_size_base=1048576 -disable_auto_compactions=true -file_checksum=true` - benchmarks command: `strace -ttfe pread64 ./db_bench -benchmarks=verifychecksum,verifyfilechecksums -use_existing_db=true -db=/tmp/testdb -rate_limiter_bytes_per_sec=1048576 -rate_limit_bg_reads=1 -rate_limit_user_ops=true -file_checksum=true` - crash test using IO_USER priority on non-validation reads with https://github.com/facebook/rocksdb/issues/9567 reverted: `python3 tools/db_crashtest.py blackbox --max_key=1000000 --write_buffer_size=524288 --target_file_size_base=524288 --level_compaction_dynamic_level_bytes=true --duration=3600 --rate_limit_bg_reads=true --rate_limit_user_ops=true --rate_limiter_bytes_per_sec=10485760 --interval=10` Reviewed By: hx235 Differential Revision: D33747386 Pulled By: ajkr fbshipit-source-id: a2d985e97912fba8c54763798e04f006ccc56e0c
2022-02-17 07:17:03 +00:00
// TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?)
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444) Summary: **Context/Summary:** - Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`. - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation) - New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main. - Misc - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority` - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444 Test Plan: - CI fake db crash/stress test - Microbenchmarking **Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench` - google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd - db_basic_bench_base: upstream - db_basic_bench_pr: db_basic_bench_base + this PR - asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read) - asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR **Test** Get ``` TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000 ``` Result ``` Coming soon ``` AsyncRead ``` TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out ``` Result ``` Base: 1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008, PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before): 1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053, ``` Reviewed By: ajkr Differential Revision: D45918925 Pulled By: hx235 fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
Status s = file_->Read(IOOptions(), next_byte_, static_cast<size_t>(size),
slice, buf, nullptr);
next_byte_ += size;
if (!s.ok()) {
return s;
}
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size());
if (slice->size() != size) {
return Status::Corruption("EOF reached while reading record");
}
return s;
}
Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) {
assert(header);
assert(next_byte_ == 0);
static_assert(BlobLogHeader::kSize <= sizeof(header_buf_),
"Buffer is smaller than BlobLogHeader::kSize");
Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_);
if (!s.ok()) {
return s;
}
if (buffer_.size() != BlobLogHeader::kSize) {
return Status::Corruption("EOF reached before file header");
}
return header->DecodeFrom(buffer_);
}
Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record,
ReadLevel level,
uint64_t* blob_offset) {
assert(record);
static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_),
"Buffer is smaller than BlobLogRecord::kHeaderSize");
Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_);
if (!s.ok()) {
return s;
}
if (buffer_.size() != BlobLogRecord::kHeaderSize) {
return Status::Corruption("EOF reached before record header");
}
s = record->DecodeHeaderFrom(buffer_);
if (!s.ok()) {
return s;
}
uint64_t kb_size = record->key_size + record->value_size;
if (blob_offset != nullptr) {
*blob_offset = next_byte_ + record->key_size;
}
switch (level) {
case kReadHeader:
next_byte_ += kb_size;
break;
case kReadHeaderKey:
record->key_buf.reset(new char[record->key_size]);
s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
next_byte_ += record->value_size;
break;
case kReadHeaderKeyBlob:
record->key_buf.reset(new char[record->key_size]);
s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
if (s.ok()) {
record->value_buf.reset(new char[record->value_size]);
s = ReadSlice(record->value_size, &record->value,
record->value_buf.get());
}
if (s.ok()) {
s = record->CheckBlobCRC();
}
break;
}
return s;
}
Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) {
assert(footer);
static_assert(BlobLogFooter::kSize <= sizeof(header_buf_),
"Buffer is smaller than BlobLogFooter::kSize");
Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_);
if (!s.ok()) {
return s;
}
if (buffer_.size() != BlobLogFooter::kSize) {
return Status::Corruption("EOF reached before file footer");
}
return footer->DecodeFrom(buffer_);
}
} // namespace ROCKSDB_NAMESPACE