rocksdb/table/table_reader.h
Hui Xiao 151242ce46 Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288)
Summary:
**Context:**
The existing stat rocksdb.sst.read.micros does not reflect each of compaction and flush cases but aggregate them, which is not so helpful for us to understand IO read behavior of each of them.

**Summary**
- Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.{flush/compaction}.read.micros`
   - Fixed the default histogram in `RandomAccessFileReader`
- New field `ReadOptions/IOOptions::io_activity`; Pass `ReadOptions` through paths under db open, flush and compaction to where we can prepare `IOOptions` and pass it to `RandomAccessFileReader`
- Use `thread_status_util` for assertion in `DbStressFSWrapper` for continuous testing on we are passing correct `io_activity` under db open, flush and compaction

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288

Test Plan:
- **Stress test**
- **Db bench 1: rocksdb.sst.read.micros COUNT ≈ sum of rocksdb.file.read.flush.micros's and rocksdb.file.read.compaction.micros's.**  (without blob)
     - May not be exactly the same due to `HistogramStat::Add` only guarantees atomic not accuracy across threads.
```
./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10)
```
```
// BlockBasedTable
rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805
rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116
rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689

// PlainTable
Does not apply
```
- **Db bench 2: performance**

**Read**

SETUP: db with 900 files
```
./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655  -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none
```run till convergence
```
./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3
```
Pre-change
`readrandom [AVG 60 runs] : 21568 (± 248) ops/sec`
Post-change (no regression, -0.3%)
`readrandom [AVG 60 runs] : 21486 (± 236) ops/sec`

**Compaction/Flush**run till convergence
```
./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655  -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none

rocksdb.sst.read.micros  COUNT : 33820
rocksdb.sst.read.flush.micros COUNT : 1800
rocksdb.sst.read.compaction.micros COUNT : 32020
```
Pre-change
`fillseq [AVG 46 runs] : 1391 (± 214) ops/sec;    0.7 (± 0.1) MB/sec`

Post-change (no regression, ~-0.4%)
`fillseq [AVG 46 runs] : 1385 (± 216) ops/sec;    0.7 (± 0.1) MB/sec`

Reviewed By: ajkr

Differential Revision: D44007011

Pulled By: hx235

fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
2023-04-21 09:07:18 -07:00

188 lines
8 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include "db/range_tombstone_fragmenter.h"
#if USE_COROUTINES
#include "folly/experimental/coro/Coroutine.h"
#include "folly/experimental/coro/Task.h"
#endif
#include "rocksdb/slice_transform.h"
#include "rocksdb/table_reader_caller.h"
#include "table/get_context.h"
#include "table/internal_iterator.h"
#include "table/multiget_context.h"
namespace ROCKSDB_NAMESPACE {
class Iterator;
struct ParsedInternalKey;
class Slice;
class Arena;
struct ReadOptions;
struct TableProperties;
class GetContext;
class MultiGetContext;
// A Table (also referred to as SST) is a sorted map from strings to strings.
// Tables are immutable and persistent. A Table may be safely accessed from
// multiple threads without external synchronization. Table readers are used
// for reading various types of table formats supported by rocksdb including
// BlockBasedTable, PlainTable and CuckooTable format.
class TableReader {
public:
virtual ~TableReader() {}
// Returns a new iterator over the table contents.
// The result of NewIterator() is initially invalid (caller must
// call one of the Seek methods on the iterator before using it).
//
// read_options: Must outlive the returned iterator.
// arena: If not null, the arena needs to be used to allocate the Iterator.
// When destroying the iterator, the caller will not call "delete"
// but Iterator::~Iterator() directly. The destructor needs to destroy
// all the states but those allocated in arena.
// skip_filters: disables checking the bloom filters even if they exist. This
// option is effective only for block-based table format.
// compaction_readahead_size: its value will only be used if caller =
// kCompaction
virtual InternalIterator* NewIterator(
const ReadOptions& read_options, const SliceTransform* prefix_extractor,
Arena* arena, bool skip_filters, TableReaderCaller caller,
size_t compaction_readahead_size = 0,
bool allow_unprepared_value = false) = 0;
virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
const ReadOptions& /*read_options*/) {
return nullptr;
}
// Given a key, return an approximate byte offset in the file where
// the data for that key begins (or would begin if the key were
// present in the file). The returned value is in terms of file
// bytes, and so includes effects like compression of the underlying data.
// E.g., the approximate offset of the last key in the table will
// be close to the file length.
// TODO(peterd): Since this function is only used for approximate size
// from beginning of file, reduce code duplication by removing this
// function and letting ApproximateSize take optional start and end, so
// that absolute start and end can be specified and optimized without
// key / index work.
virtual uint64_t ApproximateOffsetOf(const ReadOptions& read_options,
const Slice& key,
TableReaderCaller caller) = 0;
// Given start and end keys, return the approximate data size in the file
// between the keys. The returned value is in terms of file bytes, and so
// includes effects like compression of the underlying data and applicable
// portions of metadata including filters and indexes. Nullptr for start or
// end (or both) indicates absolute start or end of the table.
virtual uint64_t ApproximateSize(const ReadOptions& read_options,
const Slice& start, const Slice& end,
TableReaderCaller caller) = 0;
struct Anchor {
Anchor(const Slice& _user_key, size_t _range_size)
: user_key(_user_key.ToStringView()), range_size(_range_size) {}
std::string user_key;
size_t range_size;
};
// Now try to return approximately 128 anchor keys.
// The last one tends to be the largest key.
virtual Status ApproximateKeyAnchors(const ReadOptions& /*read_options*/,
std::vector<Anchor>& /*anchors*/) {
return Status::NotSupported("ApproximateKeyAnchors() not supported.");
}
// Set up the table for Compaction. Might change some parameters with
// posix_fadvise
virtual void SetupForCompaction() = 0;
virtual std::shared_ptr<const TableProperties> GetTableProperties() const = 0;
// Prepare work that can be done before the real Get()
virtual void Prepare(const Slice& /*target*/) {}
// Report an approximation of how much memory has been used.
virtual size_t ApproximateMemoryUsage() const = 0;
// Calls get_context->SaveValue() repeatedly, starting with
// the entry found after a call to Seek(key), until it returns false.
// May not make such a call if filter policy says that key is not present.
//
// get_context->MarkKeyMayExist needs to be called when it is configured to be
// memory only and the key is not found in the block cache.
//
// readOptions is the options for the read
// key is the key to search for
// skip_filters: disables checking the bloom filters even if they exist. This
// option is effective only for block-based table format.
virtual Status Get(const ReadOptions& readOptions, const Slice& key,
GetContext* get_context,
const SliceTransform* prefix_extractor,
bool skip_filters = false) = 0;
// Use bloom filters in the table file, if present, to filter out keys. The
// mget_range will be updated to skip keys that get a negative result from
// the filter lookup.
virtual Status MultiGetFilter(const ReadOptions& /*readOptions*/,
const SliceTransform* /*prefix_extractor*/,
MultiGetContext::Range* /*mget_range*/) {
return Status::NotSupported();
}
virtual void MultiGet(const ReadOptions& readOptions,
const MultiGetContext::Range* mget_range,
const SliceTransform* prefix_extractor,
bool skip_filters = false) {
for (auto iter = mget_range->begin(); iter != mget_range->end(); ++iter) {
*iter->s = Get(readOptions, iter->ikey, iter->get_context,
prefix_extractor, skip_filters);
}
}
#if USE_COROUTINES
virtual folly::coro::Task<void> MultiGetCoroutine(
const ReadOptions& readOptions, const MultiGetContext::Range* mget_range,
const SliceTransform* prefix_extractor, bool skip_filters = false) {
MultiGet(readOptions, mget_range, prefix_extractor, skip_filters);
co_return;
}
#endif // USE_COROUTINES
// Prefetch data corresponding to a give range of keys
// Typically this functionality is required for table implementations that
// persists the data on a non volatile storage medium like disk/SSD
virtual Status Prefetch(const ReadOptions& /* read_options */,
const Slice* begin = nullptr,
const Slice* end = nullptr) {
(void)begin;
(void)end;
// Default implementation is NOOP.
// The child class should implement functionality when applicable
return Status::OK();
}
// convert db file to a human readable form
virtual Status DumpTable(WritableFile* /*out_file*/) {
return Status::NotSupported("DumpTable() not supported");
}
// check whether there is corruption in this db file
virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
TableReaderCaller /*caller*/) {
return Status::NotSupported("VerifyChecksum() not supported");
}
};
} // namespace ROCKSDB_NAMESPACE