rocksdb/utilities/persistent_cache/block_cache_tier_file.cc
Hui Xiao 9a034801ce Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
   - For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that they call `GetImpl()` instead of the user-facing `Get()`, in order to pass the `ReadOptions::io_activity` check (see PR conversation)
   - The new user-read stats breakdown is guarded by `kExceptDetailedTimers`, since measurements show it causes a 4-5% regression relative to upstream/main.

- Misc
   - More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we finished passing `ReadOptions/IOOptions` down to the FS level, so we can now replace the [previously added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
   - Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444

Test Plan:
- CI fake db crash/stress test
- Microbenchmarking

**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: 604f6fd3f4
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR

**Test**

Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```

Result
```
Coming soon
```

AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```

Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,

PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```

Reviewed By: ajkr

Differential Revision: D45918925

Pulled By: hx235

fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-08 17:26:50 -07:00

608 lines
16 KiB
C++

// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "utilities/persistent_cache/block_cache_tier_file.h"
#ifndef OS_WIN
#include <unistd.h>
#endif
#include <functional>
#include <memory>
#include <vector>
#include "env/composite_env_wrapper.h"
#include "logging/logging.h"
#include "port/port.h"
#include "rocksdb/system_clock.h"
#include "util/crc32c.h"
namespace ROCKSDB_NAMESPACE {
//
// File creation factories
//
// Opens `filepath` for writing through `env` and stores the handle in `*file`.
//
// `use_direct_writes` is copied into EnvOptions::use_direct_writes before the
// call. Returns whatever Status Env::NewWritableFile reports.
Status NewWritableCacheFile(Env* const env, const std::string& filepath,
                            std::unique_ptr<WritableFile>* file,
                            const bool use_direct_writes = false) {
  EnvOptions env_options;
  env_options.use_direct_writes = use_direct_writes;
  return env->NewWritableFile(filepath, file, env_options);
}
// Opens `filepath` for random-access reads through `fs` and stores the handle
// in `*file`.
//
// `fs` must be non-null (asserted). `use_direct_reads` is copied into
// FileOptions::use_direct_reads. The result of
// FileSystem::NewRandomAccessFile is returned as a Status.
Status NewRandomAccessCacheFile(const std::shared_ptr<FileSystem>& fs,
                                const std::string& filepath,
                                std::unique_ptr<FSRandomAccessFile>* file,
                                const bool use_direct_reads = true) {
  assert(fs.get());

  FileOptions file_options;
  file_options.use_direct_reads = use_direct_reads;

  // The last argument is left null; no debug context is supplied.
  return fs->NewRandomAccessFile(filepath, file_options, file, nullptr);
}
//
// BlockCacheFile
//
// Deletes the file at Path() after reporting its size through `*size`.
//
// The size lookup happens first; if it fails, its Status is returned and the
// file is not deleted. Otherwise the Status of Env::DeleteFile is returned.
Status BlockCacheFile::Delete(uint64_t* size) {
  assert(env_);
  Status result = env_->GetFileSize(Path(), size);
  if (result.ok()) {
    result = env_->DeleteFile(Path());
  }
  return result;
}
//
// CacheRecord
//
// Cache record represents the record on disk
//
// +--------+---------+----------+------------+---------------+-------------+
// | magic | crc | key size | value size | key data | value data |
// +--------+---------+----------+------------+---------------+-------------+
// <-- 4 --><-- 4 --><-- 4 --><-- 4 --><-- key size --><-- v-size -->
//
// Fixed-size header that precedes the key and value bytes of a cache record.
// It consists of four 32-bit fields: magic, crc, key size and value size.
struct CacheRecordHeader {
  // Produces an all-zero header.
  CacheRecordHeader() : CacheRecordHeader(0, 0, 0) {}

  // Produces a header for a record with the given sizes. The crc field starts
  // at zero and is expected to be filled in by the owner afterwards.
  CacheRecordHeader(const uint32_t magic, const uint32_t key_size,
                    const uint32_t val_size)
      : magic_(magic), crc_(0), key_size_(key_size), val_size_(val_size) {}

  uint32_t magic_;     // record marker
  uint32_t crc_;       // checksum of the record
  uint32_t key_size_;  // number of key bytes following the header
  uint32_t val_size_;  // number of value bytes following the key
};
// One cache record: a CacheRecordHeader followed by the key bytes and the
// value bytes. key_ and val_ are Slices, i.e. views over memory owned
// elsewhere.
struct CacheRecord {
  // Marker stored in the header's magic field of every record.
  static const uint32_t MAGIC = 0xfefa;

  // Number of bytes a record holding (key, val) occupies, header included.
  static uint32_t CalcSize(const Slice& key, const Slice& val) {
    return static_cast<uint32_t>(sizeof(CacheRecordHeader) + key.size() +
                                 val.size());
  }

  // Empty record, typically filled in later by Deserialize().
  CacheRecord() = default;

  // Record for (key, val); the header checksum is computed immediately.
  CacheRecord(const Slice& key, const Slice& val)
      : hdr_(MAGIC, static_cast<uint32_t>(key.size()),
             static_cast<uint32_t>(val.size())),
        key_(key),
        val_(val) {
    hdr_.crc_ = ComputeCRC();
  }

  // Checksum over the header (with its crc field zeroed), key and value.
  uint32_t ComputeCRC() const;

  // Writes header, key and value into `bufs`, starting at buffer `*woff`.
  bool Serialize(std::vector<CacheWriteBuffer*>* bufs, size_t* woff);

  // Parses `buf` into this record; returns false if validation fails.
  bool Deserialize(const Slice& buf);

  // Copies `size` bytes of `data` into `bufs`, spilling across buffers.
  bool Append(std::vector<CacheWriteBuffer*>* bufs, size_t* woff,
              const char* data, const size_t size);

  CacheRecordHeader hdr_;
  Slice key_;
  Slice val_;
};

// The on-disk layout assumes the header is exactly four packed 32-bit fields.
static_assert(sizeof(CacheRecordHeader) == 16, "DataHeader is not aligned");
uint32_t CacheRecord::ComputeCRC() const {
uint32_t crc = 0;
CacheRecordHeader tmp = hdr_;
tmp.crc_ = 0;
crc = crc32c::Extend(crc, reinterpret_cast<const char*>(&tmp), sizeof(tmp));
crc = crc32c::Extend(crc, reinterpret_cast<const char*>(key_.data()),
key_.size());
crc = crc32c::Extend(crc, reinterpret_cast<const char*>(val_.data()),
val_.size());
return crc;
}
// Writes the record into `bufs` as header, key, value (in that order),
// beginning at buffer index `*woff`.
//
// `bufs` must not be empty (asserted). Stops at the first piece that fails to
// fit and returns false; returns true once all three pieces were appended.
bool CacheRecord::Serialize(std::vector<CacheWriteBuffer*>* bufs,
                            size_t* woff) {
  assert(bufs->size());

  if (!Append(bufs, woff, reinterpret_cast<const char*>(&hdr_),
              sizeof(hdr_))) {
    return false;
  }
  if (!Append(bufs, woff, reinterpret_cast<const char*>(key_.data()),
              key_.size())) {
    return false;
  }
  return Append(bufs, woff, reinterpret_cast<const char*>(val_.data()),
                val_.size());
}
// Copies `data_size` bytes from `data` into the buffer list, starting with the
// buffer at index `*woff` and moving on to the next one each time a buffer
// fills up.
//
// `*woff` is advanced past every buffer that has no free space left after a
// copy. Returns true when all bytes were placed; false when the buffer list
// was exhausted first (also asserted in debug builds).
bool CacheRecord::Append(std::vector<CacheWriteBuffer*>* bufs, size_t* woff,
                         const char* data, const size_t data_size) {
  assert(*woff < bufs->size());

  const char* src = data;
  size_t remaining = data_size;

  while (remaining && *woff < bufs->size()) {
    CacheWriteBuffer* const cur = (*bufs)[*woff];
    const size_t room = cur->Free();

    if (remaining > room) {
      // Only part of the data fits: fill this buffer to the brim.
      cur->Append(src, room);
      src += room;
      remaining -= room;
      assert(!cur->Free());
      assert(cur->Used() == cur->Capacity());
    } else {
      // Everything that is left fits into the current buffer.
      cur->Append(src, remaining);
      remaining = 0;
    }

    if (!cur->Free()) {
      // Current buffer is full; subsequent writes go to the next one.
      *woff += 1;
    }
  }

  assert(!remaining);
  return !remaining;
}
bool CacheRecord::Deserialize(const Slice& data) {
assert(data.size() >= sizeof(CacheRecordHeader));
if (data.size() < sizeof(CacheRecordHeader)) {
return false;
}
memcpy(&hdr_, data.data(), sizeof(hdr_));
assert(hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) == data.size());
if (hdr_.key_size_ + hdr_.val_size_ + sizeof(hdr_) != data.size()) {
return false;
}
key_ = Slice(data.data_ + sizeof(hdr_), hdr_.key_size_);
val_ = Slice(key_.data_ + hdr_.key_size_, hdr_.val_size_);
if (!(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_)) {
fprintf(stderr, "** magic %d ** \n", hdr_.magic_);
fprintf(stderr, "** key_size %d ** \n", hdr_.key_size_);
fprintf(stderr, "** val_size %d ** \n", hdr_.val_size_);
fprintf(stderr, "** key %s ** \n", key_.ToString().c_str());
fprintf(stderr, "** val %s ** \n", val_.ToString().c_str());
for (size_t i = 0; i < hdr_.val_size_; ++i) {
fprintf(stderr, "%d.", (uint8_t)val_.data()[i]);
}
fprintf(stderr, "\n** cksum %d != %d **", hdr_.crc_, ComputeCRC());
}
assert(hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_);
return hdr_.magic_ == MAGIC && ComputeCRC() == hdr_.crc_;
}
//
// RandomAccessCacheFile
//
// Opens the cache file for reading while holding the write lock on rwlock_.
// Returns the result of OpenImpl().
bool RandomAccessCacheFile::Open(const bool enable_direct_reads) {
  WriteLock exclusive_guard(&rwlock_);
  const bool opened = OpenImpl(enable_direct_reads);
  return opened;
}
bool RandomAccessCacheFile::OpenImpl(const bool enable_direct_reads) {
rwlock_.AssertHeld();
ROCKS_LOG_DEBUG(log_, "Opening cache file %s", Path().c_str());
assert(env_);
std::unique_ptr<FSRandomAccessFile> file;
Status status = NewRandomAccessCacheFile(env_->GetFileSystem(), Path(), &file,
enable_direct_reads);
if (!status.ok()) {
Error(log_, "Error opening random access file %s. %s", Path().c_str(),
status.ToString().c_str());
return false;
}
freader_.reset(new RandomAccessFileReader(std::move(file), Path(),
env_->GetSystemClock().get()));
return true;
}
bool RandomAccessCacheFile::Read(const LBA& lba, Slice* key, Slice* val,
char* scratch) {
ReadLock _(&rwlock_);
assert(lba.cache_id_ == cache_id_);
if (!freader_) {
return false;
}
Slice result;
Status s = freader_->Read(IOOptions(), lba.off_, lba.size_, &result, scratch,
nullptr);
if (!s.ok()) {
Error(log_, "Error reading from file %s. %s", Path().c_str(),
s.ToString().c_str());
return false;
}
assert(result.data() == scratch);
return ParseRec(lba, key, val, scratch);
}
bool RandomAccessCacheFile::ParseRec(const LBA& lba, Slice* key, Slice* val,
char* scratch) {
Slice data(scratch, lba.size_);
CacheRecord rec;
if (!rec.Deserialize(data)) {
assert(!"Error deserializing data");
Error(log_, "Error de-serializing record from file %s off %d",
Path().c_str(), lba.off_);
return false;
}
*key = Slice(rec.key_);
*val = Slice(rec.val_);
return true;
}
//
// WriteableCacheFile
//
// Tears the file object down under the write lock and returns all buffers to
// the allocator. By the end no references may remain (asserted).
WriteableCacheFile::~WriteableCacheFile() {
  WriteLock exclusive_guard(&rwlock_);

  // A file that never reached eof was never flushed. Since this is only a
  // cache, shutdown takes priority over persisting the pending data.
  // TODO(krad): Figure a way to flush the pending data
  const bool never_flushed = !eof_;
  if (never_flushed && file_) {
    // Drop the reference that is still held for the open writable file.
    assert(refs_ == 1);
    --refs_;
  }

  assert(!refs_);
  ClearBuffers();
}
bool WriteableCacheFile::Create(const bool /*enable_direct_writes*/,
const bool enable_direct_reads) {
WriteLock _(&rwlock_);
enable_direct_reads_ = enable_direct_reads;
ROCKS_LOG_DEBUG(log_, "Creating new cache %s (max size is %d B)",
Path().c_str(), max_size_);
assert(env_);
Status s = env_->FileExists(Path());
if (s.ok()) {
ROCKS_LOG_WARN(log_, "File %s already exists. %s", Path().c_str(),
s.ToString().c_str());
}
s = NewWritableCacheFile(env_, Path(), &file_);
if (!s.ok()) {
ROCKS_LOG_WARN(log_, "Unable to create file %s. %s", Path().c_str(),
s.ToString().c_str());
return false;
}
assert(!refs_);
++refs_;
return true;
}
bool WriteableCacheFile::Append(const Slice& key, const Slice& val, LBA* lba) {
WriteLock _(&rwlock_);
if (eof_) {
// We can't append since the file is full
return false;
}
// estimate the space required to store the (key, val)
uint32_t rec_size = CacheRecord::CalcSize(key, val);
if (!ExpandBuffer(rec_size)) {
// unable to expand the buffer
ROCKS_LOG_DEBUG(log_, "Error expanding buffers. size=%d", rec_size);
return false;
}
lba->cache_id_ = cache_id_;
lba->off_ = disk_woff_;
lba->size_ = rec_size;
CacheRecord rec(key, val);
if (!rec.Serialize(&bufs_, &buf_woff_)) {
// unexpected error: unable to serialize the data
assert(!"Error serializing record");
return false;
}
disk_woff_ += rec_size;
eof_ = disk_woff_ >= max_size_;
// dispatch buffer for flush
DispatchBuffer();
return true;
}
bool WriteableCacheFile::ExpandBuffer(const size_t size) {
rwlock_.AssertHeld();
assert(!eof_);
// determine if there is enough space
size_t free = 0; // compute the free space left in buffer
for (size_t i = buf_woff_; i < bufs_.size(); ++i) {
free += bufs_[i]->Free();
if (size <= free) {
// we have enough space in the buffer
return true;
}
}
// expand the buffer until there is enough space to write `size` bytes
assert(free < size);
assert(alloc_);
while (free < size) {
CacheWriteBuffer* const buf = alloc_->Allocate();
if (!buf) {
ROCKS_LOG_DEBUG(log_, "Unable to allocate buffers");
return false;
}
size_ += static_cast<uint32_t>(buf->Free());
free += buf->Free();
bufs_.push_back(buf);
}
assert(free >= size);
return true;
}
// Submits the next not-yet-dispatched buffer (index buf_doff_) to writer_.
//
// The caller must hold rwlock_ (checked via AssertHeld). Nothing is submitted
// when a write is already pending, or when the file is not at eof and
// buf_doff_ has caught up with buf_woff_. Otherwise the buffer is padded with
// trailing zeros, passed to writer_->Write() for file offset
// buf_doff_ * alloc_->BufferSize() with BufferWriteDone as the callback, and
// both pending_ios_ and buf_doff_ are incremented.
void WriteableCacheFile::DispatchBuffer() {
rwlock_.AssertHeld();
assert(bufs_.size());
assert(buf_doff_ <= buf_woff_);
assert(buf_woff_ <= bufs_.size());
// at most one buffer write is kept in flight; the next dispatch is triggered
// from BufferWriteDone()
if (pending_ios_) {
return;
}
if (!eof_ && buf_doff_ == buf_woff_) {
// dispatch buffer is pointing to write buffer and we haven't hit eof
return;
}
assert(eof_ || buf_doff_ < buf_woff_);
assert(buf_doff_ < bufs_.size());
assert(file_);
assert(alloc_);
auto* buf = bufs_[buf_doff_];
// buffers map onto the file back to back, one BufferSize() apart
const uint64_t file_off = buf_doff_ * alloc_->BufferSize();
// a buffer with free space may only be dispatched at eof, and only if it is
// the buffer currently being written into
assert(!buf->Free() ||
(eof_ && buf_doff_ == buf_woff_ && buf_woff_ < bufs_.size()));
// we have reached end of file, and there is space in the last buffer
// pad it with zero for direct IO
buf->FillTrailingZeros();
assert(buf->Used() % kFileAlignmentSize == 0);
writer_->Write(file_.get(), buf, file_off,
std::bind(&WriteableCacheFile::BufferWriteDone, this));
// account for the write just submitted and move on to the next buffer
pending_ios_++;
buf_doff_++;
}
// Completion callback for a buffer write submitted by DispatchBuffer().
//
// Under the write lock: marks the write as finished, dispatches the next
// buffer if any is left, and once the file is at eof with every buffer
// dispatched and no write pending, switches the file over to read mode.
void WriteableCacheFile::BufferWriteDone() {
  WriteLock exclusive_guard(&rwlock_);
  assert(bufs_.size());

  pending_ios_--;

  const bool more_to_dispatch = buf_doff_ < bufs_.size();
  if (more_to_dispatch) {
    DispatchBuffer();
  }

  // Evaluated after the dispatch above, which may change buf_doff_ and
  // pending_ios_.
  const bool fully_flushed =
      eof_ && buf_doff_ >= bufs_.size() && !pending_ios_;
  if (fully_flushed) {
    // end-of-file reached, move to read mode
    CloseAndOpenForReading();
  }
}
// Switches the file from write mode to read mode.
//
// Close() asserts that rwlock_ is held, so the caller must hold it. The
// result of OpenImpl() is not checked here; when the reopen fails, OpenImpl()
// logs the error and does not install a reader in freader_.
void WriteableCacheFile::CloseAndOpenForReading() {
// Our env abstraction does not allow reading from a file opened for
// appending, so we need to close the file and re-open it for reading
Close();
RandomAccessCacheFile::OpenImpl(enable_direct_reads_);
}
// Reads the record at `lba` out of the in-memory write buffers into `scratch`
// and returns its key and value through `*key` / `*block`.
//
// The caller must hold rwlock_. Returns false when the bytes cannot be copied
// out of the buffers (logged) or when they do not parse as a record.
bool WriteableCacheFile::ReadBuffer(const LBA& lba, Slice* key, Slice* block,
                                    char* scratch) {
  rwlock_.AssertHeld();

  if (ReadBuffer(lba, scratch)) {
    return ParseRec(lba, key, block, scratch);
  }

  Error(log_, "Error reading from buffer. cache=%d off=%d", cache_id_,
        lba.off_);
  return false;
}
// Copies lba.size_ bytes starting at logical offset lba.off_ out of the write
// buffers into `data`.
//
// The buffer list is treated as one contiguous stream in which every buffer
// covers alloc_->BufferSize() bytes. The caller must hold rwlock_. Returns
// false when the buffers run out before all bytes were copied.
bool WriteableCacheFile::ReadBuffer(const LBA& lba, char* data) {
  rwlock_.AssertHeld();
  assert(lba.off_ < disk_woff_);
  assert(alloc_);

  char* dst = data;
  size_t remaining = lba.size_;

  // Buffer that holds the first byte, and the offset of that byte within it.
  const size_t first_idx = lba.off_ / alloc_->BufferSize();
  size_t skip = lba.off_ % alloc_->BufferSize();
  assert(first_idx <= buf_woff_);

  size_t idx = first_idx;
  while (remaining && idx < bufs_.size()) {
    assert(idx <= buf_woff_);
    auto* buf = bufs_[idx];
    // Every buffer before the one being written into must be full.
    assert(idx == buf_woff_ || !buf->Free());

    // Take what this buffer has from `skip` onwards, capped at what is needed.
    const auto readable = buf->Used() - skip;
    const size_t chunk = remaining > readable ? readable : remaining;
    memcpy(dst, buf->Data() + skip, chunk);

    remaining -= chunk;
    dst += chunk;
    // Only the first buffer is entered mid-way.
    skip = 0;
    ++idx;
  }

  assert(!remaining);
  if (remaining) {
    return false;
  }

  assert(dst == data + lba.size_);
  return true;
}
// Ends the write phase of the file: logs the final sizes, returns all write
// buffers to the allocator, releases file_, and drops one reference.
//
// The caller must hold rwlock_. The asserts below spell out the expected
// state: the file has reached max_size_, every buffer has been dispatched,
// and no write is pending.
void WriteableCacheFile::Close() {
rwlock_.AssertHeld();
assert(size_ >= max_size_);
assert(disk_woff_ >= max_size_);
// all buffers have been handed to the writer
assert(buf_doff_ == bufs_.size());
// at most one buffer lies at or beyond the write position
assert(bufs_.size() - buf_woff_ <= 1);
assert(!pending_ios_);
Info(log_, "Closing file %s. size=%d written=%d", Path().c_str(), size_,
disk_woff_);
ClearBuffers();
file_.reset();
// give up one reference; Create() takes one when the file is created
assert(refs_);
--refs_;
}
void WriteableCacheFile::ClearBuffers() {
assert(alloc_);
for (size_t i = 0; i < bufs_.size(); ++i) {
alloc_->Deallocate(bufs_[i]);
}
bufs_.clear();
}
//
// ThreadedWriter implementation
//
// Starts `qdepth` worker threads, each running ThreadMain(), and records
// `io_size` as the chunk size used when writing buffers to the file.
ThreadedWriter::ThreadedWriter(PersistentCacheTier* const cache,
                               const size_t qdepth, const size_t io_size)
    : Writer(cache), io_size_(io_size) {
  for (size_t spawned = 0; spawned != qdepth; ++spawned) {
    threads_.push_back(port::Thread(&ThreadedWriter::ThreadMain, this));
  }
}
// Shuts the worker threads down: queues one exit signal per thread, joins
// them all, and clears the thread list.
void ThreadedWriter::Stop() {
  // One exit signal per worker; each worker consumes exactly one and exits.
  const size_t num_workers = threads_.size();
  for (size_t sent = 0; sent != num_workers; ++sent) {
    q_.Push(IO(/*signal=*/true));
  }

  // Wait until every worker has finished.
  for (auto& worker : threads_) {
    worker.join();
    assert(!worker.joinable());
  }
  threads_.clear();
}
// Queues a request to write `buf` to `file`; `callback` is invoked by the
// worker thread after the request has been processed. `file_off` is carried
// along in the request.
void ThreadedWriter::Write(WritableFile* const file, CacheWriteBuffer* buf,
                           const uint64_t file_off,
                           const std::function<void()> callback) {
  IO request(file, buf, file_off, callback);
  q_.Push(std::move(request));
}
// Worker loop: takes requests off the queue one at a time, reserves space in
// the cache for each, writes its buffer out, and then runs its callback.
// Returns when an exit-signal request is received.
void ThreadedWriter::ThreadMain() {
  for (;;) {
    // Fetch the next request.
    IO io(q_.Pop());
    if (io.signal_) {
      // Exit signal queued by Stop().
      return;
    }

    // Reserving space can fail while every file in the system is currently
    // being accessed; retry once per second until it succeeds.
    while (!cache_->Reserve(io.buf_->Used())) {
      /* sleep override */
      SystemClock::Default()->SleepForMicroseconds(1000000);
    }

    DispatchIO(io);
    io.callback_();
  }
}
// Appends the used portion of io.buf_ to io.file_ in chunks of at most
// io_size_ bytes.
//
// Each chunk is clamped to the bytes that are actually left, so the final
// chunk never reads past the used portion of the buffer when Used() is not a
// multiple of io_size_. An io_size_ of zero is treated as "write everything
// in one chunk" rather than looping forever on empty appends.
//
// Write errors are reported on stderr (and assert in debug builds) but do not
// stop the loop: this is a cache, and a damaged record is rejected when it is
// read back.
void ThreadedWriter::DispatchIO(const IO& io) {
  const size_t total = io.buf_->Used();
  size_t written = 0;
  while (written < total) {
    const size_t remaining = total - written;
    const size_t chunk =
        (io_size_ != 0 && io_size_ < remaining) ? io_size_ : remaining;

    Slice data(io.buf_->Data() + written, chunk);
    Status s = io.file_->Append(data);
    assert(s.ok());
    if (!s.ok()) {
      // That is definite IO error to device. There is not much we can
      // do but ignore the failure. This can lead to corruption of data on
      // disk, but the cache will skip while reading
      fprintf(stderr, "Error writing data to file. %s\n", s.ToString().c_str());
    }
    written += chunk;
  }
}
} // namespace ROCKSDB_NAMESPACE