2020-08-28 01:15:11 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#include "db/db_test_util.h"
|
2023-01-13 02:09:07 +00:00
|
|
|
#include "file/file_prefetch_buffer.h"
|
|
|
|
#include "file/file_util.h"
|
|
|
|
#include "rocksdb/file_system.h"
|
2020-08-28 01:15:11 +00:00
|
|
|
#include "test_util/sync_point.h"
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
#ifdef GFLAGS
|
|
|
|
#include "tools/io_tracer_parser_tool.h"
|
|
|
|
#endif
|
2024-10-17 22:52:55 +00:00
|
|
|
#include "rocksdb/flush_block_policy.h"
|
2023-01-13 02:09:07 +00:00
|
|
|
#include "util/random.h"
|
2020-08-28 01:15:11 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
namespace {

// Global switch tests flip to allow/disallow io_uring usage.
static bool enable_io_uring = true;

// NOTE(review): presumably this is the extern "C" hook RocksDB queries at
// runtime to decide whether io_uring may be used for async reads — confirm
// against the definition site in the RocksDB sources.
extern "C" bool RocksDbIOUringEnable() { return enable_io_uring; }

}  // namespace
|
|
|
|
|
2020-08-28 01:15:11 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
class MockFS;
|
|
|
|
|
2021-09-13 15:45:13 +00:00
|
|
|
class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
|
2020-08-28 01:15:11 +00:00
|
|
|
public:
|
|
|
|
MockRandomAccessFile(std::unique_ptr<FSRandomAccessFile>& file,
|
2023-06-16 20:04:30 +00:00
|
|
|
bool support_prefetch, std::atomic_int& prefetch_count,
|
|
|
|
bool small_buffer_alignment = false)
|
2021-09-13 15:45:13 +00:00
|
|
|
: FSRandomAccessFileOwnerWrapper(std::move(file)),
|
2020-08-28 01:15:11 +00:00
|
|
|
support_prefetch_(support_prefetch),
|
2023-06-16 20:04:30 +00:00
|
|
|
prefetch_count_(prefetch_count),
|
|
|
|
small_buffer_alignment_(small_buffer_alignment) {}
|
2020-08-28 01:15:11 +00:00
|
|
|
|
|
|
|
IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
|
|
|
|
IODebugContext* dbg) override {
|
|
|
|
if (support_prefetch_) {
|
|
|
|
prefetch_count_.fetch_add(1);
|
|
|
|
return target()->Prefetch(offset, n, options, dbg);
|
|
|
|
} else {
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
return IOStatus::NotSupported("Prefetch not supported");
|
2020-08-28 01:15:11 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-06-16 20:04:30 +00:00
|
|
|
size_t GetRequiredBufferAlignment() const override {
|
|
|
|
return small_buffer_alignment_
|
|
|
|
? 1
|
|
|
|
: FSRandomAccessFileOwnerWrapper::GetRequiredBufferAlignment();
|
|
|
|
}
|
|
|
|
|
2020-08-28 01:15:11 +00:00
|
|
|
private:
|
|
|
|
const bool support_prefetch_;
|
|
|
|
std::atomic_int& prefetch_count_;
|
2023-06-16 20:04:30 +00:00
|
|
|
const bool small_buffer_alignment_;
|
2020-08-28 01:15:11 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
class MockFS : public FileSystemWrapper {
|
|
|
|
public:
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
explicit MockFS(const std::shared_ptr<FileSystem>& wrapped,
|
2023-06-16 20:04:30 +00:00
|
|
|
bool support_prefetch, bool small_buffer_alignment = false)
|
|
|
|
: FileSystemWrapper(wrapped),
|
|
|
|
support_prefetch_(support_prefetch),
|
|
|
|
small_buffer_alignment_(small_buffer_alignment) {}
|
2020-08-28 01:15:11 +00:00
|
|
|
|
2021-11-02 16:06:02 +00:00
|
|
|
static const char* kClassName() { return "MockFS"; }
|
|
|
|
const char* Name() const override { return kClassName(); }
|
|
|
|
|
2020-08-28 01:15:11 +00:00
|
|
|
IOStatus NewRandomAccessFile(const std::string& fname,
|
|
|
|
const FileOptions& opts,
|
|
|
|
std::unique_ptr<FSRandomAccessFile>* result,
|
|
|
|
IODebugContext* dbg) override {
|
|
|
|
std::unique_ptr<FSRandomAccessFile> file;
|
|
|
|
IOStatus s;
|
|
|
|
s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
|
2023-06-16 20:04:30 +00:00
|
|
|
result->reset(new MockRandomAccessFile(
|
|
|
|
file, support_prefetch_, prefetch_count_, small_buffer_alignment_));
|
2020-08-28 01:15:11 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ClearPrefetchCount() { prefetch_count_ = 0; }
|
|
|
|
|
|
|
|
bool IsPrefetchCalled() { return prefetch_count_ > 0; }
|
|
|
|
|
2021-04-28 19:52:53 +00:00
|
|
|
int GetPrefetchCount() {
|
|
|
|
return prefetch_count_.load(std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
|
2020-08-28 01:15:11 +00:00
|
|
|
private:
|
|
|
|
const bool support_prefetch_;
|
2023-06-16 20:04:30 +00:00
|
|
|
const bool small_buffer_alignment_;
|
2020-08-28 01:15:11 +00:00
|
|
|
std::atomic_int prefetch_count_{0};
|
|
|
|
};
|
|
|
|
|
|
|
|
class PrefetchTest
|
|
|
|
: public DBTestBase,
|
|
|
|
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
|
|
|
|
public:
|
2021-07-23 15:37:27 +00:00
|
|
|
PrefetchTest() : DBTestBase("prefetch_test", true) {}
|
2023-01-20 18:17:57 +00:00
|
|
|
|
2023-06-16 20:04:30 +00:00
|
|
|
virtual void SetGenericOptions(Env* env, bool use_direct_io,
|
|
|
|
Options& options) {
|
2024-06-19 16:53:59 +00:00
|
|
|
anon::OptionsOverride options_override;
|
|
|
|
// for !disable_io in PrefetchTest.Basic
|
|
|
|
options_override.full_block_cache = true;
|
|
|
|
options = CurrentOptions(options_override);
|
2023-01-20 18:17:57 +00:00
|
|
|
options.write_buffer_size = 1024;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.env = env;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
if (use_direct_io) {
|
|
|
|
options.use_direct_reads = true;
|
|
|
|
options.use_direct_io_for_flush_and_compaction = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetBlockBasedTableOptions(BlockBasedTableOptions& table_options) {
|
|
|
|
table_options.no_block_cache = true;
|
|
|
|
table_options.cache_index_and_filter_blocks = false;
|
|
|
|
table_options.metadata_block_size = 1024;
|
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
|
|
|
|
}
|
2024-10-17 22:52:55 +00:00
|
|
|
|
|
|
|
void VerifyScan(ReadOptions& iter_ro, ReadOptions& cmp_iter_ro,
|
|
|
|
const Slice* seek_key, const Slice* iterate_upper_bound,
|
|
|
|
bool prefix_same_as_start) const {
|
|
|
|
assert(!(seek_key == nullptr));
|
|
|
|
iter_ro.iterate_upper_bound = cmp_iter_ro.iterate_upper_bound =
|
|
|
|
iterate_upper_bound;
|
|
|
|
iter_ro.prefix_same_as_start = cmp_iter_ro.prefix_same_as_start =
|
|
|
|
prefix_same_as_start;
|
|
|
|
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(iter_ro));
|
|
|
|
auto cmp_iter = std::unique_ptr<Iterator>(db_->NewIterator(cmp_iter_ro));
|
|
|
|
|
|
|
|
iter->Seek(*seek_key);
|
|
|
|
cmp_iter->Seek(*seek_key);
|
|
|
|
|
|
|
|
while (iter->Valid() && cmp_iter->Valid()) {
|
|
|
|
if (iter->key() != cmp_iter->key()) {
|
|
|
|
// Error
|
|
|
|
ASSERT_TRUE(false);
|
|
|
|
}
|
|
|
|
iter->Next();
|
|
|
|
cmp_iter->Next();
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_TRUE(!cmp_iter->Valid() && !iter->Valid());
|
|
|
|
ASSERT_TRUE(cmp_iter->status().ok() && iter->status().ok());
|
|
|
|
}
|
|
|
|
|
|
|
|
void VerifySeekPrevSeek(ReadOptions& iter_ro, ReadOptions& cmp_iter_ro,
|
|
|
|
const Slice* seek_key,
|
|
|
|
const Slice* iterate_upper_bound,
|
|
|
|
bool prefix_same_as_start) {
|
|
|
|
assert(!(seek_key == nullptr));
|
|
|
|
iter_ro.iterate_upper_bound = cmp_iter_ro.iterate_upper_bound =
|
|
|
|
iterate_upper_bound;
|
|
|
|
iter_ro.prefix_same_as_start = cmp_iter_ro.prefix_same_as_start =
|
|
|
|
prefix_same_as_start;
|
|
|
|
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(iter_ro));
|
|
|
|
auto cmp_iter = std::unique_ptr<Iterator>(db_->NewIterator(cmp_iter_ro));
|
|
|
|
|
|
|
|
// Seek
|
|
|
|
cmp_iter->Seek(*seek_key);
|
|
|
|
ASSERT_TRUE(cmp_iter->Valid());
|
|
|
|
ASSERT_OK(cmp_iter->status());
|
|
|
|
|
|
|
|
iter->Seek(*seek_key);
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
|
|
|
|
ASSERT_EQ(iter->key(), cmp_iter->key());
|
|
|
|
|
|
|
|
// Prev op should pass
|
|
|
|
cmp_iter->Prev();
|
|
|
|
ASSERT_TRUE(cmp_iter->Valid());
|
|
|
|
ASSERT_OK(cmp_iter->status());
|
|
|
|
|
|
|
|
iter->Prev();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
|
|
|
|
ASSERT_EQ(iter->key(), cmp_iter->key());
|
|
|
|
|
|
|
|
// Reseek would follow as usual
|
|
|
|
cmp_iter->Seek(*seek_key);
|
|
|
|
ASSERT_TRUE(cmp_iter->Valid());
|
|
|
|
ASSERT_OK(cmp_iter->status());
|
|
|
|
|
|
|
|
iter->Seek(*seek_key);
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
|
|
|
|
ASSERT_EQ(iter->key(), cmp_iter->key());
|
|
|
|
}
|
2020-08-28 01:15:11 +00:00
|
|
|
};
|
|
|
|
|
2021-04-28 19:52:53 +00:00
|
|
|
// Instantiates PrefetchTest over all four combinations of:
//   param <0>: mock file system supports Prefetch() (true/false)
//   param <1>: direct I/O enabled (true/false)
INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest,
                        ::testing::Combine(::testing::Bool(),
                                           ::testing::Bool()));
|
|
|
|
|
2020-08-28 01:15:11 +00:00
|
|
|
// Builds the key used throughout these tests: "my_key_<num><postfix>".
std::string BuildKey(int num, std::string postfix = "") {
  std::string key("my_key_");
  key += std::to_string(num);
  key += postfix;
  return key;
}
|
|
|
|
|
2023-07-21 21:52:52 +00:00
|
|
|
// This test verifies the following basic functionalities of prefetching:
|
|
|
|
// (1) If the underlying file system supports prefetch, and directIO is not
|
|
|
|
// make sure prefetch() is called and FilePrefetchBuffer is not used.
|
|
|
|
// (2) If the underlying file system doesn't support prefetch, or directIO is
|
|
|
|
// enabled, make sure prefetch() is not called and FilePrefetchBuffer is
|
|
|
|
// used.
|
|
|
|
// (3) Measure read bytes, hit and miss of SST's tail prefetching during table
|
|
|
|
// open.
|
2020-08-28 01:15:11 +00:00
|
|
|
TEST_P(PrefetchTest, Basic) {
|
|
|
|
// First param is if the mockFS support_prefetch or not
|
2021-01-06 18:48:24 +00:00
|
|
|
bool support_prefetch =
|
|
|
|
std::get<0>(GetParam()) &&
|
|
|
|
test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
|
2023-01-20 18:17:57 +00:00
|
|
|
std::shared_ptr<MockFS> fs =
|
|
|
|
std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
|
2020-08-28 01:15:11 +00:00
|
|
|
|
|
|
|
// Second param is if directIO is enabled or not
|
|
|
|
bool use_direct_io = std::get<1>(GetParam());
|
2023-01-20 18:17:57 +00:00
|
|
|
|
2020-08-28 01:15:11 +00:00
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265)
Summary:
**Context/Summary:**
We are adding new stats to measure behavior of prefetched tail size and look up into this buffer
The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265
Test Plan:
**- Piggyback on existing test**
**- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.**
```
./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true
```
```
rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600
rocksdb.table.open.prefetch.tail.miss COUNT : 91
rocksdb.table.open.prefetch.tail.hit COUNT : 1034
```
**- No perf regression observed in db_bench**
SETUP command: create same db with ~900 files for pre-change/post-change.
```
./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none
```
TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing.
```
./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true
```
async io = 0, direct io read = true
| seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs
-- | -- | -- | --
pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec
post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec
async io = 1, direct io read = true
| seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs
-- | -- | -- | --
pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec
post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec
Reviewed By: ajkr
Differential Revision: D43781467
Pulled By: hx235
fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2020-08-28 01:15:11 +00:00
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
const int kNumKeys = 1100;
|
2020-08-28 01:15:11 +00:00
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
// create first key range
|
|
|
|
WriteBatch batch;
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
2023-08-19 00:47:22 +00:00
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), "v1"));
|
2020-08-28 01:15:11 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
2023-08-19 00:47:22 +00:00
|
|
|
ASSERT_OK(db_->Flush(FlushOptions()));
|
2020-08-28 01:15:11 +00:00
|
|
|
|
|
|
|
// create second key range
|
|
|
|
batch.Clear();
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
2023-08-19 00:47:22 +00:00
|
|
|
ASSERT_OK(batch.Put(BuildKey(i, "key2"), "v2"));
|
2020-08-28 01:15:11 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
2023-08-19 00:47:22 +00:00
|
|
|
ASSERT_OK(db_->Flush(FlushOptions()));
|
2020-08-28 01:15:11 +00:00
|
|
|
|
|
|
|
// delete second key range
|
|
|
|
batch.Clear();
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
2020-12-22 18:29:58 +00:00
|
|
|
ASSERT_OK(batch.Delete(BuildKey(i, "key2")));
|
2020-08-28 01:15:11 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
2023-07-21 21:52:52 +00:00
|
|
|
ASSERT_OK(db_->Flush(FlushOptions()));
|
|
|
|
|
2023-08-19 00:47:22 +00:00
|
|
|
std::vector<LiveFileMetaData> metadata;
|
|
|
|
db_->GetLiveFilesMetaData(&metadata);
|
|
|
|
const size_t num_file = metadata.size();
|
2023-07-21 21:52:52 +00:00
|
|
|
// To verify SST file tail prefetch (once per file) during flush output
|
|
|
|
// verification
|
|
|
|
if (support_prefetch && !use_direct_io) {
|
|
|
|
ASSERT_TRUE(fs->IsPrefetchCalled());
|
2023-08-19 00:47:22 +00:00
|
|
|
ASSERT_EQ(num_file, fs->GetPrefetchCount());
|
2023-07-21 21:52:52 +00:00
|
|
|
ASSERT_EQ(0, buff_prefetch_count);
|
|
|
|
fs->ClearPrefetchCount();
|
|
|
|
} else {
|
|
|
|
ASSERT_FALSE(fs->IsPrefetchCalled());
|
2023-08-19 00:47:22 +00:00
|
|
|
ASSERT_EQ(buff_prefetch_count, num_file);
|
2023-07-21 21:52:52 +00:00
|
|
|
buff_prefetch_count = 0;
|
|
|
|
}
|
2020-08-28 01:15:11 +00:00
|
|
|
|
|
|
|
// compact database
|
|
|
|
std::string start_key = BuildKey(0);
|
|
|
|
std::string end_key = BuildKey(kNumKeys - 1);
|
|
|
|
Slice least(start_key.data(), start_key.size());
|
|
|
|
Slice greatest(end_key.data(), end_key.size());
|
|
|
|
|
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265)
Summary:
**Context/Summary:**
We are adding new stats to measure behavior of prefetched tail size and look up into this buffer
The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265
Test Plan:
**- Piggyback on existing test**
**- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.**
```
./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true
```
```
rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600
rocksdb.table.open.prefetch.tail.miss COUNT : 91
rocksdb.table.open.prefetch.tail.hit COUNT : 1034
```
**- No perf regression observed in db_bench**
SETUP command: create same db with ~900 files for pre-change/post-change.
```
./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none
```
TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing.
```
./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true
```
async io = 0, direct io read = true
| seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs
-- | -- | -- | --
pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec
post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec
async io = 1, direct io read = true
| seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs
-- | -- | -- | --
pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec
post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec
Reviewed By: ajkr
Differential Revision: D43781467
Pulled By: hx235
fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
|
|
|
HistogramData prev_table_open_prefetch_tail_read;
|
|
|
|
options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
|
|
|
|
&prev_table_open_prefetch_tail_read);
|
|
|
|
const uint64_t prev_table_open_prefetch_tail_miss =
|
|
|
|
options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS);
|
|
|
|
const uint64_t prev_table_open_prefetch_tail_hit =
|
|
|
|
options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
|
|
|
|
|
2020-08-28 01:15:11 +00:00
|
|
|
// commenting out the line below causes the example to work correctly
|
2020-12-22 18:29:58 +00:00
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
|
2020-08-28 01:15:11 +00:00
|
|
|
|
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265)
Summary:
**Context/Summary:**
We are adding new stats to measure behavior of prefetched tail size and look up into this buffer
The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265
Test Plan:
**- Piggyback on existing test**
**- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.**
```
./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true
```
```
rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600
rocksdb.table.open.prefetch.tail.miss COUNT : 91
rocksdb.table.open.prefetch.tail.hit COUNT : 1034
```
**- No perf regression observed in db_bench**
SETUP command: create same db with ~900 files for pre-change/post-change.
```
./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none
```
TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing.
```
./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true
```
async io = 0, direct io read = true
| seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs
-- | -- | -- | --
pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec
post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec
async io = 1, direct io read = true
| seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs
-- | -- | -- | --
pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec
post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec
Reviewed By: ajkr
Differential Revision: D43781467
Pulled By: hx235
fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
|
|
|
HistogramData cur_table_open_prefetch_tail_read;
|
|
|
|
options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES,
|
|
|
|
&cur_table_open_prefetch_tail_read);
|
|
|
|
const uint64_t cur_table_open_prefetch_tail_miss =
|
|
|
|
options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS);
|
|
|
|
const uint64_t cur_table_open_prefetch_tail_hit =
|
|
|
|
options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT);
|
|
|
|
|
2023-07-21 21:52:52 +00:00
|
|
|
// To verify prefetch during compaction input read
|
2020-08-28 01:15:11 +00:00
|
|
|
if (support_prefetch && !use_direct_io) {
|
|
|
|
ASSERT_TRUE(fs->IsPrefetchCalled());
|
2023-07-21 21:52:52 +00:00
|
|
|
// To rule out false positive by the SST file tail prefetch during
|
|
|
|
// compaction output verification
|
|
|
|
ASSERT_GT(fs->GetPrefetchCount(), 1);
|
2020-08-28 01:15:11 +00:00
|
|
|
ASSERT_EQ(0, buff_prefetch_count);
|
2023-07-21 21:52:52 +00:00
|
|
|
fs->ClearPrefetchCount();
|
2020-08-28 01:15:11 +00:00
|
|
|
} else {
|
|
|
|
ASSERT_FALSE(fs->IsPrefetchCalled());
|
2023-08-04 00:23:02 +00:00
|
|
|
// To rule out false positive by the SST file tail prefetch during
|
|
|
|
// compaction output verification
|
|
|
|
ASSERT_GT(buff_prefetch_count, 1);
|
2023-07-21 21:52:52 +00:00
|
|
|
buff_prefetch_count = 0;
|
|
|
|
|
Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265)
Summary:
**Context/Summary:**
We are adding new stats to measure behavior of prefetched tail size and look up into this buffer
The stat collection is done in FilePrefetchBuffer but only for prefetched tail buffer during table open for now using FilePrefetchBuffer enum. It's cleaner than the alternative of implementing in upper-level call places of FilePrefetchBuffer for table open. It also has the benefit of extensible to other types of FilePrefetchBuffer if needed. See db bench for perf regression concern.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11265
Test Plan:
**- Piggyback on existing test**
**- rocksdb.table.open.prefetch.tail.miss is harder to UT so I manually set prefetch tail read bytes to be small and run db bench.**
```
./db_bench -db=/tmp/testdb -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=5000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 -use_direct_reads=true
```
```
rocksdb.table.open.prefetch.tail.read.bytes P50 : 4096.000000 P95 : 4096.000000 P99 : 4096.000000 P100 : 4096.000000 COUNT : 225 SUM : 921600
rocksdb.table.open.prefetch.tail.miss COUNT : 91
rocksdb.table.open.prefetch.tail.hit COUNT : 1034
```
**- No perf regression observed in db_bench**
SETUP command: create same db with ~900 files for pre-change/post-change.
```
./db_bench -db=/tmp/testdb -benchmarks="fillseq" -key_size=32 -value_size=512 -num=500000 -write_buffer_size=655360 -disable_auto_compactions=true -target_file_size_base=16777216 -compression_type=none
```
TEST command 60 runs or til convergence: as suggested by anand1976 and akankshamahajan15, vary `seek_nexts` and `async_io` in testing.
```
./db_bench -use_existing_db=true -db=/tmp/testdb -statistics=false -cache_size=0 -cache_index_and_filter_blocks=false -benchmarks=seekrandom[-X60] -num=50000 -seek_nexts={10, 500, 1000} -async_io={0|1} -use_direct_reads=true
```
async io = 0, direct io read = true
| seek_nexts = 10, 30 runs | seek_nexts = 500, 12 runs | seek_nexts = 1000, 6 runs
-- | -- | -- | --
pre-post change | 4776 (± 28) ops/sec; 24.8 (± 0.1) MB/sec | 288 (± 1) ops/sec; 74.8 (± 0.4) MB/sec | 145 (± 4) ops/sec; 75.6 (± 2.2) MB/sec
post-change | 4790 (± 32) ops/sec; 24.9 (± 0.2) MB/sec | 288 (± 3) ops/sec; 74.7 (± 0.8) MB/sec | 143 (± 3) ops/sec; 74.5 (± 1.6) MB/sec
async io = 1, direct io read = true
| seek_nexts = 10, 54 runs | seek_nexts = 500, 6 runs | seek_nexts = 1000, 4 runs
-- | -- | -- | --
pre-post change | 3350 (± 36) ops/sec; 17.4 (± 0.2) MB/sec | 264 (± 0) ops/sec; 68.7 (± 0.2) MB/sec | 138 (± 1) ops/sec; 71.8 (± 1.0) MB/sec
post-change | 3358 (± 27) ops/sec; 17.4 (± 0.1) MB/sec | 263 (± 2) ops/sec; 68.3 (± 0.8) MB/sec | 139 (± 1) ops/sec; 72.6 (± 0.6) MB/sec
Reviewed By: ajkr
Differential Revision: D43781467
Pulled By: hx235
fbshipit-source-id: a706a18472a8edb2b952bac3af40eec803537f2a
2023-03-15 21:02:43 +00:00
|
|
|
ASSERT_GT(cur_table_open_prefetch_tail_read.count,
|
|
|
|
prev_table_open_prefetch_tail_read.count);
|
|
|
|
ASSERT_GT(cur_table_open_prefetch_tail_hit,
|
|
|
|
prev_table_open_prefetch_tail_hit);
|
|
|
|
ASSERT_GE(cur_table_open_prefetch_tail_miss,
|
|
|
|
prev_table_open_prefetch_tail_miss);
|
2020-08-28 01:15:11 +00:00
|
|
|
}
|
|
|
|
|
2024-06-19 16:53:59 +00:00
|
|
|
for (bool disable_io : {false, true}) {
|
|
|
|
SCOPED_TRACE("disable_io: " + std::to_string(disable_io));
|
|
|
|
ReadOptions ro;
|
|
|
|
if (disable_io) {
|
|
|
|
// When this is set on the second iteration, all blocks should be in
|
|
|
|
// block cache
|
|
|
|
ro.read_tier = ReadTier::kBlockCacheTier;
|
|
|
|
}
|
|
|
|
// count the keys
|
|
|
|
{
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
int num_keys = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
num_keys++;
|
|
|
|
}
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ(num_keys, kNumKeys);
|
2020-08-28 01:15:11 +00:00
|
|
|
}
|
|
|
|
|
2024-06-19 16:53:59 +00:00
|
|
|
// To verify prefetch during user scan, when IO allowed
|
|
|
|
if (disable_io) {
|
|
|
|
ASSERT_FALSE(fs->IsPrefetchCalled());
|
|
|
|
ASSERT_EQ(0, buff_prefetch_count);
|
|
|
|
} else if (support_prefetch && !use_direct_io) {
|
|
|
|
ASSERT_TRUE(fs->IsPrefetchCalled());
|
|
|
|
fs->ClearPrefetchCount();
|
|
|
|
ASSERT_EQ(0, buff_prefetch_count);
|
|
|
|
} else {
|
|
|
|
ASSERT_FALSE(fs->IsPrefetchCalled());
|
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
}
|
2020-08-28 01:15:11 +00:00
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-06-16 20:04:30 +00:00
|
|
|
class PrefetchTailTest : public PrefetchTest {
|
|
|
|
public:
|
|
|
|
bool SupportPrefetch() const {
|
|
|
|
return std::get<0>(GetParam()) &&
|
|
|
|
test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
|
|
|
|
}
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
|
2023-06-16 20:04:30 +00:00
|
|
|
bool UseDirectIO() const { return std::get<1>(GetParam()); }
|
|
|
|
|
|
|
|
bool UseFilePrefetchBuffer() const {
|
|
|
|
return !SupportPrefetch() || UseDirectIO();
|
|
|
|
}
|
|
|
|
|
|
|
|
Env* GetEnv(bool small_buffer_alignment = false) const {
|
|
|
|
std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
|
|
|
|
env_->GetFileSystem(), SupportPrefetch(), small_buffer_alignment);
|
|
|
|
|
|
|
|
return new CompositeEnvWrapper(env_, fs);
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetGenericOptions(Env* env, bool use_direct_io,
|
|
|
|
Options& options) override {
|
|
|
|
PrefetchTest::SetGenericOptions(env, use_direct_io, options);
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
}
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
|
2023-06-16 20:04:30 +00:00
|
|
|
void SetBlockBasedTableOptions(
|
|
|
|
BlockBasedTableOptions& table_options, bool partition_filters = true,
|
|
|
|
uint64_t metadata_block_size =
|
|
|
|
BlockBasedTableOptions().metadata_block_size,
|
|
|
|
bool use_small_cache = false) {
|
|
|
|
table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
|
|
|
|
table_options.partition_filters = partition_filters;
|
|
|
|
if (table_options.partition_filters) {
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
}
|
|
|
|
table_options.metadata_block_size = metadata_block_size;
|
|
|
|
|
|
|
|
if (use_small_cache) {
|
|
|
|
LRUCacheOptions co;
|
|
|
|
co.capacity = 1;
|
|
|
|
std::shared_ptr<Cache> cache = NewLRUCache(co);
|
|
|
|
table_options.block_cache = cache;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int64_t GetNumIndexPartition() const {
|
|
|
|
int64_t index_partition_counts = 0;
|
|
|
|
TablePropertiesCollection all_table_props;
|
|
|
|
assert(db_->GetPropertiesOfAllTables(&all_table_props).ok());
|
|
|
|
for (const auto& name_and_table_props : all_table_props) {
|
|
|
|
const auto& table_props = name_and_table_props.second;
|
|
|
|
index_partition_counts += table_props->index_partitions;
|
|
|
|
}
|
|
|
|
return index_partition_counts;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Instantiate PrefetchTailTest over all four combinations of
// (filesystem prefetch requested, use direct IO).
INSTANTIATE_TEST_CASE_P(PrefetchTailTest, PrefetchTailTest,
                        ::testing::Combine(::testing::Bool(),
                                           ::testing::Bool()));
|
|
|
|
|
|
|
|
TEST_P(PrefetchTailTest, Basic) {
|
|
|
|
std::unique_ptr<Env> env(GetEnv());
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use separate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
Options options;
|
2023-06-16 20:04:30 +00:00
|
|
|
SetGenericOptions(env.get(), UseDirectIO(), options);
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
|
|
|
|
BlockBasedTableOptions bbto;
|
2023-06-16 20:04:30 +00:00
|
|
|
SetBlockBasedTableOptions(bbto);
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use separate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
2023-06-16 20:04:30 +00:00
|
|
|
if (UseDirectIO() && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use separate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
ROCKSDB_GTEST_BYPASS("Direct IO is not supported");
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(Put("k1", "v1"));
|
|
|
|
|
|
|
|
HistogramData pre_flush_file_read;
|
|
|
|
options.statistics->histogramData(FILE_READ_FLUSH_MICROS,
|
|
|
|
&pre_flush_file_read);
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
HistogramData post_flush_file_read;
|
|
|
|
options.statistics->histogramData(FILE_READ_FLUSH_MICROS,
|
|
|
|
&post_flush_file_read);
|
2023-06-16 20:04:30 +00:00
|
|
|
if (UseFilePrefetchBuffer()) {
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
// `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()`
|
|
|
|
// should read from the prefetched tail in file prefetch buffer instead of
|
|
|
|
// initiating extra SST reads. Therefore `BlockBasedTable::PrefetchTail()`
|
|
|
|
// should be the only SST read in table verification during flush.
|
|
|
|
ASSERT_EQ(post_flush_file_read.count - pre_flush_file_read.count, 1);
|
|
|
|
} else {
|
|
|
|
// Without the prefetched tail in file prefetch buffer,
|
|
|
|
// `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()`
|
|
|
|
// will initiate extra SST reads
|
|
|
|
ASSERT_GT(post_flush_file_read.count - pre_flush_file_read.count, 1);
|
|
|
|
}
|
|
|
|
ASSERT_OK(Put("k1", "v2"));
|
|
|
|
ASSERT_OK(Put("k2", "v2"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
CompactRangeOptions cro;
|
|
|
|
HistogramData pre_compaction_file_read;
|
|
|
|
options.statistics->histogramData(FILE_READ_COMPACTION_MICROS,
|
|
|
|
&pre_compaction_file_read);
|
|
|
|
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
|
|
|
|
HistogramData post_compaction_file_read;
|
|
|
|
options.statistics->histogramData(FILE_READ_COMPACTION_MICROS,
|
|
|
|
&post_compaction_file_read);
|
2023-06-16 20:04:30 +00:00
|
|
|
if (UseFilePrefetchBuffer()) {
|
Record and use the tail size to prefetch table tail (#11406)
Summary:
**Context:**
We prefetch the tail part of a SST file (i.e, the blocks after data blocks till the end of the file) during each SST file open in hope to prefetch all the stuff at once ahead of time for later read e.g, footer, meta index, filter/index etc. The existing approach to estimate the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky case (e.g, small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decide to record the exact tail size and use it directly to prefetch tail of the SST instead of relying heuristics.
**Summary:**
- Obtain and record in manifest the tail size in `BlockBasedTableBuilder::Finish()`
- For backward compatibility, we fall back to TailPrefetchStats and last to simple heuristics that the tail size is a linear portion of the file size - see PR conversation for more.
- Make`tail_start_offset` part of the table properties and deduct tail size to record in manifest for external files (e.g, file ingestion, import CF) and db repair (with no access to manifest).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406
Test Plan:
1. New UT
2. db bench
Note: db bench on /tmp/ where direct read is supported is too slow to finish and the default pinning setting in db bench is not helpful to profile # sst read of Get. Therefore I hacked the following to obtain the following comparison.
```
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index bd5669f0f..791484c1f 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail(
&tail_prefetch_size);
// Try file system prefetch
- if (!file->use_direct_io() && !force_direct_prefetch) {
+ if (false && !file->use_direct_io() && !force_direct_prefetch) {
if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority)
.IsNotSupported()) {
prefetch_buffer->reset(new FilePrefetchBuffer(
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index ea40f5fa0..39a0ac385 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -4191,6 +4191,8 @@ class Benchmark {
std::shared_ptr<TableFactory>(NewCuckooTableFactory(table_options));
} else {
BlockBasedTableOptions block_based_options;
+ block_based_options.metadata_cache_options.partition_pinning =
+ PinningTier::kAll;
block_based_options.checksum =
static_cast<ChecksumType>(FLAGS_checksum_type);
if (FLAGS_use_hash_search) {
```
Create DB
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
ReadRandom
```
./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none
```
(a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 3395
rocksdb.sst.read.micros P50 : 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614
```
(b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies())
```
rocksdb.table.open.prefetch.tail.hit COUNT : 14257
rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540
```
As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR
3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR.
Reviewed By: pdillinger
Differential Revision: D45413346
Pulled By: hx235
fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064
2023-05-08 20:14:28 +00:00
|
|
|
// `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()`
|
|
|
|
// should read from the prefetched tail in file prefetch buffer instead of
|
|
|
|
// initiating extra SST reads.
|
|
|
|
//
|
|
|
|
// Therefore the 3 reads are
|
|
|
|
// (1) `ProcessKeyValueCompaction()` of input file 1
|
|
|
|
// (2) `ProcessKeyValueCompaction()` of input file 2
|
|
|
|
// (3) `BlockBasedTable::PrefetchTail()` of output file during table
|
|
|
|
// verification in compaction
|
|
|
|
ASSERT_EQ(post_compaction_file_read.count - pre_compaction_file_read.count,
|
|
|
|
3);
|
|
|
|
} else {
|
|
|
|
// Without the prefetched tail in file prefetch buffer,
|
|
|
|
// `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()`
|
|
|
|
// as well as reading other parts of the tail (e.g, footer, table
|
|
|
|
// properties..) will initiate extra SST reads
|
|
|
|
ASSERT_GT(post_compaction_file_read.count - pre_compaction_file_read.count,
|
|
|
|
3);
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-06-16 20:04:30 +00:00
|
|
|
TEST_P(PrefetchTailTest, UpgradeToTailSizeInManifest) {
|
|
|
|
if (!UseFilePrefetchBuffer()) {
|
|
|
|
ROCKSDB_GTEST_BYPASS(
|
|
|
|
"Upgrade to tail size in manifest is only relevant when RocksDB file "
|
|
|
|
"prefetch buffer is used.");
|
|
|
|
}
|
|
|
|
if (UseDirectIO()) {
|
|
|
|
ROCKSDB_GTEST_BYPASS(
|
2023-08-18 22:52:04 +00:00
|
|
|
"To simplify testing logics with setting file's buffer alignment to "
|
|
|
|
"be "
|
2023-06-16 20:04:30 +00:00
|
|
|
"1, direct IO is required to be disabled.");
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<Env> env(GetEnv(true /* small_buffer_alignment */));
|
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), false /* use_direct_io*/, options);
|
|
|
|
options.max_open_files = -1;
|
|
|
|
options.write_buffer_size = 1024 * 1024;
|
|
|
|
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
SetBlockBasedTableOptions(table_options, false /* partition_filters */,
|
|
|
|
1 /* metadata_block_size*/,
|
|
|
|
true /* use_small_cache */);
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
// To simulate a pre-upgrade DB where file tail size is not recorded in
|
|
|
|
// manifest
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FileMetaData::FileMetaData", [&](void* arg) {
|
|
|
|
FileMetaData* meta = static_cast<FileMetaData*>(arg);
|
|
|
|
meta->tail_size = 0;
|
|
|
|
});
|
|
|
|
|
|
|
|
ASSERT_OK(TryReopen(options));
|
|
|
|
for (int i = 0; i < 10000; ++i) {
|
|
|
|
ASSERT_OK(Put("k" + std::to_string(i), "v"));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
// To simulate a DB undergoing the upgrade where tail size to prefetch is
|
|
|
|
// inferred to be a small number for files with no tail size recorded in
|
|
|
|
// manifest.
|
|
|
|
// "1" is chosen to be such number so that with `small_buffer_alignment ==
|
2023-08-18 22:52:04 +00:00
|
|
|
// true` and `use_small_cache == true`, it would have caused one file read
|
|
|
|
// per index partition during db open if the upgrade is done wrong.
|
2023-06-16 20:04:30 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
|
|
|
|
std::pair<size_t*, size_t*>* prefetch_off_len_pair =
|
|
|
|
static_cast<std::pair<size_t*, size_t*>*>(arg);
|
|
|
|
size_t* prefetch_off = prefetch_off_len_pair->first;
|
|
|
|
size_t* tail_size = prefetch_off_len_pair->second;
|
|
|
|
const size_t file_size = *prefetch_off + *tail_size;
|
|
|
|
|
|
|
|
*tail_size = 1;
|
|
|
|
*prefetch_off = file_size - (*tail_size);
|
|
|
|
});
|
|
|
|
|
|
|
|
ASSERT_OK(TryReopen(options));
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
|
|
|
|
HistogramData db_open_file_read;
|
|
|
|
options.statistics->histogramData(FILE_READ_DB_OPEN_MICROS,
|
|
|
|
&db_open_file_read);
|
|
|
|
|
|
|
|
int64_t num_index_partition = GetNumIndexPartition();
|
|
|
|
// If the upgrade is done right, db open will prefetch all the index
|
|
|
|
// partitions at once, instead of doing one read per partition.
|
2023-08-18 22:52:04 +00:00
|
|
|
// That is, together with `metadata_block_size == 1`, there will be more
|
|
|
|
// index partitions than number of non index partitions reads.
|
2023-06-16 20:04:30 +00:00
|
|
|
ASSERT_LT(db_open_file_read.count, num_index_partition);
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies BlockBasedTableOptions.max_auto_readahead_size is
|
|
|
|
// configured dynamically.
|
2021-02-24 00:52:35 +00:00
|
|
|
TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
|
|
|
|
// First param is if the mockFS support_prefetch or not
|
|
|
|
bool support_prefetch =
|
|
|
|
std::get<0>(GetParam()) &&
|
|
|
|
test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
|
2023-01-20 18:17:57 +00:00
|
|
|
std::shared_ptr<MockFS> fs =
|
|
|
|
std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
|
2021-02-24 00:52:35 +00:00
|
|
|
|
|
|
|
// Second param is if directIO is enabled or not
|
|
|
|
bool use_direct_io = std::get<1>(GetParam());
|
|
|
|
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
2021-02-24 00:52:35 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2021-02-24 00:52:35 +00:00
|
|
|
table_options.max_auto_readahead_size = 0;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
|
|
|
|
|
|
|
// DB open will create table readers unless we reduce the table cache
|
|
|
|
// capacity. SanitizeOptions will set max_open_files to minimum of 20. Table
|
|
|
|
// cache is allocated with max_open_files - 10 as capacity. So override
|
|
|
|
// max_open_files to 10 so table cache capacity will become 0. This will
|
|
|
|
// prevent file open during DB open and force the file to be opened during
|
|
|
|
// Iteration.
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
|
|
|
|
int* max_open_files = (int*)arg;
|
|
|
|
*max_open_files = 11;
|
|
|
|
});
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
Random rnd(309);
|
|
|
|
int key_count = 0;
|
|
|
|
const int num_keys_per_level = 100;
|
|
|
|
// Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
|
|
|
|
for (int level = 2; level >= 0; level--) {
|
|
|
|
key_count = level * num_keys_per_level;
|
|
|
|
for (int i = 0; i < num_keys_per_level; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(level);
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
std::vector<int> buff_prefectch_level_count = {0, 0, 0};
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(TryReopen(options));
|
2021-02-24 00:52:35 +00:00
|
|
|
{
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
|
|
|
|
fs->ClearPrefetchCount();
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
|
|
|
|
for (int level = 2; level >= 0; level--) {
|
|
|
|
key_count = level * num_keys_per_level;
|
|
|
|
switch (level) {
|
|
|
|
case 0:
|
|
|
|
// max_auto_readahead_size is set 0 so data and index blocks are not
|
|
|
|
// prefetched.
|
|
|
|
ASSERT_OK(db_->SetOptions(
|
|
|
|
{{"block_based_table_factory", "{max_auto_readahead_size=0;}"}}));
|
|
|
|
break;
|
|
|
|
case 1:
|
|
|
|
// max_auto_readahead_size is set less than
|
2022-04-16 00:28:09 +00:00
|
|
|
// initial_auto_readahead_size. So readahead_size remains equal to
|
|
|
|
// max_auto_readahead_size.
|
2021-02-24 00:52:35 +00:00
|
|
|
ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
|
|
|
|
"{max_auto_readahead_size=4096;}"}}));
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
|
|
|
|
"{max_auto_readahead_size=65536;}"}}));
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
2024-10-25 17:24:54 +00:00
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_OK(iter->Refresh()); // Update to latest mutable options
|
2021-02-24 00:52:35 +00:00
|
|
|
|
|
|
|
for (int i = 0; i < num_keys_per_level; ++i) {
|
|
|
|
iter->Seek(Key(key_count++));
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
|
|
|
|
buff_prefectch_level_count[level] = buff_prefetch_count;
|
|
|
|
if (support_prefetch && !use_direct_io) {
|
|
|
|
if (level == 0) {
|
|
|
|
ASSERT_FALSE(fs->IsPrefetchCalled());
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(fs->IsPrefetchCalled());
|
|
|
|
}
|
|
|
|
fs->ClearPrefetchCount();
|
|
|
|
} else {
|
|
|
|
ASSERT_FALSE(fs->IsPrefetchCalled());
|
|
|
|
if (level == 0) {
|
|
|
|
ASSERT_EQ(buff_prefetch_count, 0);
|
|
|
|
} else {
|
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
|
|
|
}
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!support_prefetch) {
|
|
|
|
ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]);
|
|
|
|
}
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
Close();
|
|
|
|
}
|
2022-04-16 00:28:09 +00:00
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies BlockBasedTableOptions.initial_auto_readahead_size is
|
|
|
|
// configured dynamically.
|
2022-04-16 00:28:09 +00:00
|
|
|
TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) {
|
|
|
|
// First param is if the mockFS support_prefetch or not
|
|
|
|
bool support_prefetch =
|
|
|
|
std::get<0>(GetParam()) &&
|
|
|
|
test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
|
|
|
|
|
|
|
|
// Second param is if directIO is enabled or not
|
|
|
|
bool use_direct_io = std::get<1>(GetParam());
|
|
|
|
|
|
|
|
std::shared_ptr<MockFS> fs =
|
|
|
|
std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
2022-04-16 00:28:09 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2022-04-16 00:28:09 +00:00
|
|
|
table_options.initial_auto_readahead_size = 0;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
// DB open will create table readers unless we reduce the table cache
|
|
|
|
// capacity. SanitizeOptions will set max_open_files to minimum of 20.
|
|
|
|
// Table cache is allocated with max_open_files - 10 as capacity. So
|
|
|
|
// override max_open_files to 10 so table cache capacity will become 0.
|
|
|
|
// This will prevent file open during DB open and force the file to be
|
|
|
|
// opened during Iteration.
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
|
|
|
|
int* max_open_files = (int*)arg;
|
|
|
|
*max_open_files = 11;
|
|
|
|
});
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
Random rnd(309);
|
|
|
|
int key_count = 0;
|
|
|
|
const int num_keys_per_level = 100;
|
|
|
|
// Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
|
|
|
|
for (int level = 2; level >= 0; level--) {
|
|
|
|
key_count = level * num_keys_per_level;
|
|
|
|
for (int i = 0; i < num_keys_per_level; ++i) {
|
|
|
|
ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
MoveFilesToLevel(level);
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(TryReopen(options));
|
2022-04-16 00:28:09 +00:00
|
|
|
{
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
|
|
|
|
fs->ClearPrefetchCount();
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
std::vector<int> buff_prefetch_level_count = {0, 0, 0};
|
|
|
|
|
|
|
|
for (int level = 2; level >= 0; level--) {
|
|
|
|
key_count = level * num_keys_per_level;
|
|
|
|
switch (level) {
|
|
|
|
case 0:
|
|
|
|
// initial_auto_readahead_size is set 0 so data and index blocks are
|
|
|
|
// not prefetched.
|
|
|
|
ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
|
|
|
|
"{initial_auto_readahead_size=0;}"}}));
|
|
|
|
break;
|
|
|
|
case 1:
|
2023-08-18 22:52:04 +00:00
|
|
|
// intial_auto_readahead_size and max_auto_readahead_size are set
|
|
|
|
// same so readahead_size remains same.
|
2022-04-16 00:28:09 +00:00
|
|
|
ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
|
|
|
|
"{initial_auto_readahead_size=4096;max_"
|
|
|
|
"auto_readahead_size=4096;}"}}));
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
ASSERT_OK(
|
|
|
|
db_->SetOptions({{"block_based_table_factory",
|
|
|
|
"{initial_auto_readahead_size=65536;}"}}));
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
2024-10-25 17:24:54 +00:00
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_OK(iter->Refresh()); // Update to latest mutable options
|
2022-04-16 00:28:09 +00:00
|
|
|
|
|
|
|
for (int i = 0; i < num_keys_per_level; ++i) {
|
|
|
|
iter->Seek(Key(key_count++));
|
|
|
|
iter->Next();
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-04-16 00:28:09 +00:00
|
|
|
|
|
|
|
buff_prefetch_level_count[level] = buff_prefetch_count;
|
|
|
|
if (support_prefetch && !use_direct_io) {
|
|
|
|
if (level == 0) {
|
|
|
|
ASSERT_FALSE(fs->IsPrefetchCalled());
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(fs->IsPrefetchCalled());
|
|
|
|
}
|
|
|
|
fs->ClearPrefetchCount();
|
|
|
|
} else {
|
|
|
|
ASSERT_FALSE(fs->IsPrefetchCalled());
|
|
|
|
if (level == 0) {
|
|
|
|
ASSERT_EQ(buff_prefetch_count, 0);
|
|
|
|
} else {
|
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
|
|
|
}
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!support_prefetch) {
|
|
|
|
ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
Close();
|
|
|
|
}
|
2022-09-01 18:56:00 +00:00
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies BlockBasedTableOptions.num_file_reads_for_auto_readahead
|
|
|
|
// is configured dynamically.
|
2022-09-01 18:56:00 +00:00
|
|
|
TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) {
|
|
|
|
// First param is if the mockFS support_prefetch or not
|
|
|
|
bool support_prefetch =
|
|
|
|
std::get<0>(GetParam()) &&
|
|
|
|
test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
|
|
|
|
|
|
|
|
const int kNumKeys = 2000;
|
|
|
|
std::shared_ptr<MockFS> fs =
|
|
|
|
std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
|
|
|
|
|
|
|
// Second param is if directIO is enabled or not
|
|
|
|
bool use_direct_io = std::get<1>(GetParam());
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
2022-09-01 18:56:00 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2022-09-01 18:56:00 +00:00
|
|
|
table_options.num_file_reads_for_auto_readahead = 0;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
|
|
|
|
std::string start_key = BuildKey(0);
|
|
|
|
std::string end_key = BuildKey(kNumKeys - 1);
|
|
|
|
Slice least(start_key.data(), start_key.size());
|
|
|
|
Slice greatest(end_key.data(), end_key.size());
|
|
|
|
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
|
|
|
|
|
|
|
|
Close();
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(TryReopen(options));
|
2022-09-01 18:56:00 +00:00
|
|
|
|
|
|
|
fs->ClearPrefetchCount();
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
|
|
|
|
{
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
|
|
|
|
/*
|
|
|
|
* Reseek keys from sequential Data Blocks within same partitioned
|
|
|
|
* index. It will prefetch the data block at the first seek since
|
2023-08-18 22:52:04 +00:00
|
|
|
* num_file_reads_for_auto_readahead = 0. Data Block size is nearly 4076
|
|
|
|
* so readahead will fetch 8 * 1024 data more initially (2 more data
|
|
|
|
* blocks).
|
2022-09-01 18:56:00 +00:00
|
|
|
*/
|
|
|
|
iter->Seek(BuildKey(0)); // Prefetch data + index block since
|
|
|
|
// num_file_reads_for_auto_readahead = 0.
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Seek(BuildKey(1000)); // In buffer
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Seek(BuildKey(1004)); // In buffer
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Seek(BuildKey(1008)); // Prefetch Data
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Seek(BuildKey(1011)); // In buffer
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Seek(BuildKey(1015)); // In buffer
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Seek(BuildKey(1019)); // In buffer
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
// Missed 2 blocks but they are already in buffer so no reset.
|
|
|
|
iter->Seek(BuildKey(103)); // Already in buffer.
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Seek(BuildKey(1033)); // Prefetch Data.
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
if (support_prefetch && !use_direct_io) {
|
|
|
|
ASSERT_EQ(fs->GetPrefetchCount(), 4);
|
|
|
|
fs->ClearPrefetchCount();
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(buff_prefetch_count, 4);
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
Close();
|
|
|
|
}
|
2021-02-24 00:52:35 +00:00
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies the basic functionality of implicit autoreadahead:
|
|
|
|
// - Enable implicit autoreadahead and prefetch only if sequential blocks are
|
|
|
|
// read,
|
|
|
|
// - If data is already in buffer and few blocks are not requested to read,
|
|
|
|
// don't reset,
|
|
|
|
// - If data blocks are sequential during read after enabling implicit
|
|
|
|
// autoreadahead, reset readahead parameters.
|
2021-04-28 19:52:53 +00:00
|
|
|
TEST_P(PrefetchTest, PrefetchWhenReseek) {
  // Verifies implicit auto-readahead behavior across a series of Seek()
  // patterns: sequential reseeks trigger prefetch, non-sequential ones do
  // not, and blocks already in the prefetch buffer do not reset readahead.
  //
  // First param is if the mockFS support_prefetch or not
  bool support_prefetch =
      std::get<0>(GetParam()) &&
      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);

  const int kNumKeys = 2000;
  std::shared_ptr<MockFS> fs =
      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));

  // Second param is if directIO is enabled or not
  bool use_direct_io = std::get<1>(GetParam());

  Options options;
  SetGenericOptions(env.get(), use_direct_io, options);
  BlockBasedTableOptions table_options;
  SetBlockBasedTableOptions(table_options);
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  // Count buffer-level prefetches via the sync point; MockFS counts
  // file-system-level Prefetch() calls separately.
  int buff_prefetch_count = 0;
  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
                                        [&](void*) { buff_prefetch_count++; });
  SyncPoint::GetInstance()->EnableProcessing();

  Status s = TryReopen(options);
  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
    // If direct IO is not supported, skip the test
    return;
  } else {
    ASSERT_OK(s);
  }

  WriteBatch batch;
  Random rnd(309);
  for (int i = 0; i < kNumKeys; i++) {
    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
  }
  ASSERT_OK(db_->Write(WriteOptions(), &batch));

  std::string start_key = BuildKey(0);
  std::string end_key = BuildKey(kNumKeys - 1);
  Slice least(start_key.data(), start_key.size());
  Slice greatest(end_key.data(), end_key.size());

  // Compact everything into one file so iteration covers a single SST.
  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));

  fs->ClearPrefetchCount();
  buff_prefetch_count = 0;

  {
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    /*
     * Reseek keys from sequential Data Blocks within same partitioned
     * index. After 2 sequential reads it will prefetch the data block.
     * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data
     * more initially (2 more data blocks).
     */
    iter->Seek(BuildKey(0));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1000));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1004));  // Prefetch Data
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1008));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1011));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1015));  // Prefetch Data
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1019));
    ASSERT_TRUE(iter->Valid());
    // Missed 2 blocks but they are already in buffer so no reset.
    iter->Seek(BuildKey(103));  // Already in buffer.
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1033));  // Prefetch Data
    ASSERT_TRUE(iter->Valid());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 3);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 3);
      buff_prefetch_count = 0;
    }
  }
  {
    /*
     * Reseek keys from non sequential data blocks within same partitioned
     * index. buff_prefetch_count will be 0 in that case.
     */
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    iter->Seek(BuildKey(0));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1008));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1019));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1033));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1048));
    ASSERT_TRUE(iter->Valid());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 0);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 0);
      buff_prefetch_count = 0;
    }
  }
  {
    /*
     * Reseek keys from Single Data Block. No prefetching should happen.
     */
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    iter->Seek(BuildKey(0));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(10));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(100));
    ASSERT_TRUE(iter->Valid());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 0);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 0);
      buff_prefetch_count = 0;
    }
  }
  {
    /*
     * Reseek keys from sequential data blocks to set implicit auto readahead
     * and prefetch data but after that iterate over different (non
     * sequential) data blocks which won't prefetch any data further. So
     * buff_prefetch_count will be 1 for the first one.
     */
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    iter->Seek(BuildKey(0));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1000));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1004));  // This iteration will prefetch buffer
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1008));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(
        BuildKey(996));  // Reseek won't prefetch any data and
                         // readahead_size will be initialized to 8*1024.
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(992));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(989));
    ASSERT_TRUE(iter->Valid());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 1);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 1);
      buff_prefetch_count = 0;
    }

    // Read sequentially to confirm readahead_size is reset to initial value
    // (2 more data blocks)
    iter->Seek(BuildKey(1011));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1015));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1019));  // Prefetch Data
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1022));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1026));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(103));  // Prefetch Data
    ASSERT_TRUE(iter->Valid());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 2);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 2);
      buff_prefetch_count = 0;
    }
  }
  {
    /* Reseek keys from sequential partitioned index block. Since partitioned
     * index fetch are sequential, buff_prefetch_count will be 1.
     */
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    iter->Seek(BuildKey(0));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1167));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1334));  // This iteration will prefetch buffer
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1499));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1667));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1847));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1999));
    ASSERT_TRUE(iter->Valid());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 1);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 1);
      buff_prefetch_count = 0;
    }
  }
  {
    /*
     * Reseek over different keys from different blocks. buff_prefetch_count
     * is set 0.
     */
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    int i = 0;
    int j = 1000;
    do {
      iter->Seek(BuildKey(i));
      if (!iter->Valid()) {
        ASSERT_OK(iter->status());
        break;
      }
      i = i + 100;
      iter->Seek(BuildKey(j));
      j = j + 100;
    } while (i < 1000 && j < kNumKeys && iter->Valid());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 0);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 0);
      buff_prefetch_count = 0;
    }
  }
  {
    /* Iterates sequentially over all keys. It will prefetch the buffer.*/
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    }
    ASSERT_OK(iter->status());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 13);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 13);
      buff_prefetch_count = 0;
    }
  }

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  Close();
}
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies the functionality of implicit autoreadahead when caching
|
|
|
|
// is enabled:
|
|
|
|
// - If data is already in buffer and few blocks are not requested to read,
|
|
|
|
// don't reset,
|
|
|
|
// - If block was eligible for prefetching/in buffer but found in cache, don't
|
|
|
|
// prefetch and reset.
|
2021-04-28 19:52:53 +00:00
|
|
|
TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) {
  // Verifies implicit auto-readahead interaction with the block cache:
  // blocks found in cache are not prefetched again, and a block already in
  // the prefetch buffer does not reset the readahead state.
  //
  // First param is if the mockFS support_prefetch or not
  bool support_prefetch =
      std::get<0>(GetParam()) &&
      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);

  const int kNumKeys = 2000;
  std::shared_ptr<MockFS> fs =
      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));

  // Second param is if directIO is enabled or not
  bool use_direct_io = std::get<1>(GetParam());

  Options options;
  SetGenericOptions(env.get(), use_direct_io, options);
  BlockBasedTableOptions table_options;
  SetBlockBasedTableOptions(table_options);
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);  // 4MB
  table_options.block_cache = cache;
  table_options.no_block_cache = false;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  int buff_prefetch_count = 0;
  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
                                        [&](void*) { buff_prefetch_count++; });
  SyncPoint::GetInstance()->EnableProcessing();

  Status s = TryReopen(options);
  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
    // If direct IO is not supported, skip the test
    return;
  } else {
    ASSERT_OK(s);
  }

  WriteBatch batch;
  Random rnd(309);
  for (int i = 0; i < kNumKeys; i++) {
    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
  }
  ASSERT_OK(db_->Write(WriteOptions(), &batch));

  std::string start_key = BuildKey(0);
  std::string end_key = BuildKey(kNumKeys - 1);
  Slice least(start_key.data(), start_key.size());
  Slice greatest(end_key.data(), end_key.size());

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));

  fs->ClearPrefetchCount();
  buff_prefetch_count = 0;

  {
    /*
     * Reseek keys from sequential Data Blocks within same partitioned
     * index. After 2 sequential reads it will prefetch the data block.
     * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data
     * more initially (2 more data blocks).
     */
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    // Warm up the cache
    iter->Seek(BuildKey(1011));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1015));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1019));
    ASSERT_TRUE(iter->Valid());
    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 1);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 1);
      buff_prefetch_count = 0;
    }
  }
  {
    // After caching, blocks will be read from cache (Sequential blocks)
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
    iter->Seek(BuildKey(0));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1000));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1004));  // Prefetch data (not in cache).
    ASSERT_TRUE(iter->Valid());
    // Missed one sequential block but next is in already in buffer so
    // readahead will not be reset.
    iter->Seek(BuildKey(1011));
    ASSERT_TRUE(iter->Valid());
    // Prefetch data but blocks are in cache so no prefetch and reset.
    iter->Seek(BuildKey(1015));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1019));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1022));
    ASSERT_TRUE(iter->Valid());
    // Prefetch data with readahead_size = 4 blocks.
    iter->Seek(BuildKey(1026));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(103));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1033));
    ASSERT_TRUE(iter->Valid());
    iter->Seek(BuildKey(1037));
    ASSERT_TRUE(iter->Valid());

    if (support_prefetch && !use_direct_io) {
      ASSERT_EQ(fs->GetPrefetchCount(), 3);
      fs->ClearPrefetchCount();
    } else {
      ASSERT_EQ(buff_prefetch_count, 2);
      buff_prefetch_count = 0;
    }
  }

  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  Close();
}
|
2020-08-28 01:15:11 +00:00
|
|
|
|
2023-09-23 01:12:08 +00:00
|
|
|
TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) {
  // Verifies that scans with ReadOptions::auto_readahead_size enabled return
  // the same results as scans without it (VerifyScan compares the two
  // iterators), across different num_file_reads_for_auto_readahead values.
  if (mem_env_ || encrypted_env_) {
    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
    return;
  }

  std::shared_ptr<MockFS> fs =
      std::make_shared<MockFS>(FileSystem::Default(), false);

  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
  Options options;
  SetGenericOptions(env.get(), /*use_direct_io=*/false, options);
  options.statistics = CreateDBStatistics();
  const std::string prefix = "my_key_";
  options.prefix_extractor.reset(NewFixedPrefixTransform(prefix.size()));
  BlockBasedTableOptions table_options;
  SetBlockBasedTableOptions(table_options);
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  Status s = TryReopen(options);
  ASSERT_OK(s);

  Random rnd(309);
  WriteBatch batch;

  // Create the DB with keys from "my_key_aaaaaaaaaa" to "my_key_zzzzzzzzzz"
  for (int i = 0; i < 26; i++) {
    std::string key = prefix;

    for (int j = 0; j < 10; j++) {
      key += char('a' + i);
      ASSERT_OK(batch.Put(key, rnd.RandomString(1000)));
    }
  }
  ASSERT_OK(db_->Write(WriteOptions(), &batch));

  std::string start_key = prefix + "a";

  std::string end_key = prefix;
  for (int j = 0; j < 10; j++) {
    end_key += char('a' + 25);
  }

  Slice least(start_key.data(), start_key.size());
  Slice greatest(end_key.data(), end_key.size());

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));

  // Try with different num_file_reads_for_auto_readahead from 0 to 2.
  for (size_t i = 0; i < 3; i++) {
    std::shared_ptr<Cache> cache = NewLRUCache(1024 * 1024, 2);
    table_options.block_cache = cache;
    table_options.no_block_cache = false;
    table_options.num_file_reads_for_auto_readahead = i;
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));

    s = TryReopen(options);
    ASSERT_OK(s);

    // Warm up the cache.
    {
      auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));

      iter->Seek(prefix + "bbb");
      ASSERT_TRUE(iter->Valid());

      iter->Seek(prefix + "ccccccccc");
      ASSERT_TRUE(iter->Valid());

      iter->Seek(prefix + "ddd");
      ASSERT_TRUE(iter->Valid());

      iter->Seek(prefix + "ddddddd");
      ASSERT_TRUE(iter->Valid());

      iter->Seek(prefix + "e");
      ASSERT_TRUE(iter->Valid());

      iter->Seek(prefix + "eeeee");
      ASSERT_TRUE(iter->Valid());

      iter->Seek(prefix + "eeeeeeeee");
      ASSERT_TRUE(iter->Valid());
    }

    ReadOptions ropts;
    ReadOptions cmp_ro;

    if (std::get<0>(GetParam())) {
      ropts.readahead_size = cmp_ro.readahead_size = 32768;
    }

    if (std::get<1>(GetParam())) {
      ropts.async_io = true;
    }

    // With and without tuning readahead_size.
    ropts.auto_readahead_size = true;
    cmp_ro.auto_readahead_size = false;
    ASSERT_OK(options.statistics->Reset());
    // Seek with a upper bound
    const std::string seek_key_str = prefix + "aaa";
    const Slice seek_key(seek_key_str);
    const std::string ub_str = prefix + "uuu";
    const Slice ub(ub_str);
    VerifyScan(ropts /* iter_ro */, cmp_ro /* cmp_iter_ro */,
               &seek_key /* seek_key */, &ub /* iterate_upper_bound */,
               false /* prefix_same_as_start */);

    // Seek with a new seek key and upper bound
    const std::string seek_key_new_str = prefix + "v";
    const Slice seek_key_new(seek_key_new_str);
    const std::string ub_new_str = prefix + "y";
    const Slice ub_new(ub_new_str);
    VerifyScan(ropts /* iter_ro */, cmp_ro /* cmp_iter_ro */,
               &seek_key_new /* seek_key */, &ub_new /* iterate_upper_bound */,
               false /* prefix_same_as_start */);

    // Seek with no upper bound, prefix_same_as_start = true
    VerifyScan(ropts /* iter_ro */, cmp_ro /* cmp_iter_ro */,
               &seek_key /* seek_key */, nullptr /* iterate_upper_bound */,
               true /* prefix_same_as_start */);
    Close();
  }
}
|
|
|
|
|
|
|
|
TEST_F(PrefetchTest, PrefetchWithBlockLookupAutoTuneWithPrev) {
  // Verifies that Seek/Prev/Seek sequences with auto_readahead_size enabled
  // match the results of the same sequence with it disabled
  // (VerifySeekPrevSeek compares both iterators).
  if (mem_env_ || encrypted_env_) {
    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
    return;
  }

  // First param is if the mockFS support_prefetch or not
  std::shared_ptr<MockFS> fs =
      std::make_shared<MockFS>(FileSystem::Default(), false);

  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
  Options options;
  SetGenericOptions(env.get(), /*use_direct_io=*/false, options);
  options.statistics = CreateDBStatistics();
  const std::string prefix = "my_key_";
  options.prefix_extractor.reset(NewFixedPrefixTransform(prefix.size()));
  BlockBasedTableOptions table_options;
  SetBlockBasedTableOptions(table_options);
  std::shared_ptr<Cache> cache = NewLRUCache(1024 * 1024, 2);
  table_options.block_cache = cache;
  table_options.no_block_cache = false;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  Status s = TryReopen(options);
  ASSERT_OK(s);

  Random rnd(309);
  WriteBatch batch;

  // Keys from "my_key_aaaaaaaaaa" to "my_key_zzzzzzzzzz".
  for (int i = 0; i < 26; i++) {
    std::string key = prefix;

    for (int j = 0; j < 10; j++) {
      key += char('a' + i);
      ASSERT_OK(batch.Put(key, rnd.RandomString(1000)));
    }
  }
  ASSERT_OK(db_->Write(WriteOptions(), &batch));

  std::string start_key = prefix + "a";

  std::string end_key = prefix;
  for (int j = 0; j < 10; j++) {
    end_key += char('a' + 25);
  }

  Slice least(start_key.data(), start_key.size());
  Slice greatest(end_key.data(), end_key.size());

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));

  ReadOptions ropts;
  ropts.auto_readahead_size = true;
  ReadOptions cmp_readopts = ropts;
  cmp_readopts.auto_readahead_size = false;

  const std::string seek_key_str = prefix + "bbb";
  const Slice seek_key(seek_key_str);
  const std::string ub_key = prefix + "uuu";
  const Slice ub(ub_key);

  VerifySeekPrevSeek(ropts /* iter_ro */, cmp_readopts /* cmp_iter_ro */,
                     &seek_key /* seek_key */, &ub /* iterate_upper_bound */,
                     false /* prefix_same_as_start */);

  VerifySeekPrevSeek(ropts /* iter_ro */, cmp_readopts /* cmp_iter_ro */,
                     &seek_key /* seek_key */,
                     nullptr /* iterate_upper_bound */,
                     true /* prefix_same_as_start */);
  Close();
}
|
2023-09-23 01:12:08 +00:00
|
|
|
|
2024-10-17 22:52:55 +00:00
|
|
|
// Fixture for readahead-trimming tests, parameterized over
// (index_shortening mode, auto_readahead_size).
class PrefetchTrimReadaheadTestParam
    : public DBTestBase,
      public ::testing::WithParamInterface<
          std::tuple<BlockBasedTableOptions::IndexShorteningMode, bool>> {
 public:
  const std::string kPrefix = "a_prefix_";
  Random rnd = Random(309);

  PrefetchTrimReadaheadTestParam()
      : DBTestBase("prefetch_trim_readahead_test_param", true) {}
  virtual void SetGenericOptions(Env* env, Options& options) {
    options = CurrentOptions();
    options.env = env;
    options.create_if_missing = true;
    options.disable_auto_compactions = true;
    options.statistics = CreateDBStatistics();

    // To make all the data blocks fit in one file for testing purpose
    options.write_buffer_size = 1024 * 1024 * 1024;
    options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefix.size()));
  }

  void SetBlockBasedTableOptions(BlockBasedTableOptions& table_options) {
    table_options.no_block_cache = false;
    table_options.index_shortening = std::get<0>(GetParam());

    // To force keys with different prefixes are in different data blocks of
    // the file for testing purpose
    table_options.block_size = 1;
    table_options.flush_block_policy_factory.reset(
        new FlushBlockBySizePolicyFactory());
  }
};
|
2023-10-03 00:47:24 +00:00
|
|
|
|
2024-10-17 22:52:55 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
    PrefetchTrimReadaheadTestParam, PrefetchTrimReadaheadTestParam,
    ::testing::Combine(
        // Params are as follows -
        // Param 0 - TableOptions::index_shortening
        // Param 1 - ReadOptions::auto_readahead_size
        ::testing::Values(
            BlockBasedTableOptions::IndexShorteningMode::kNoShortening,
            BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators,
            BlockBasedTableOptions::IndexShorteningMode::
                kShortenSeparatorsAndSuccessor),
        ::testing::Bool()));
|
|
|
|
|
|
|
|
TEST_P(PrefetchTrimReadaheadTestParam, PrefixSameAsStart) {
  // Verifies that with prefix_same_as_start = true, readahead is trimmed to
  // the prefix boundary (READAHEAD_TRIMMED ticker > 0) when
  // auto_readahead_size is on, and never trimmed when it is off.
  if (mem_env_ || encrypted_env_) {
    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
    return;
  }
  const bool auto_readahead_size = std::get<1>(GetParam());

  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
      FileSystem::Default(), false /* support_prefetch */,
      true /* small_buffer_alignment */);
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
  Options options;
  SetGenericOptions(env.get(), options);
  // Fixed typo in local identifier: "table_optoins" -> "table_options".
  BlockBasedTableOptions table_options;
  SetBlockBasedTableOptions(table_options);
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  Status s = TryReopen(options);
  ASSERT_OK(s);

  // To create a DB with data block layout (denoted as "[...]" below ) as the
  // following:
  // ["a_prefix_0": random value]
  // ["a_prefix_1": random value]
  // ...
  // ["a_prefix_9": random value]
  // ["c_prefix_0": random value]
  // ["d_prefix_1": random value]
  // ...
  // ["l_prefix_9": random value]
  //
  // We want to verify keys not with prefix "a_prefix_" are not prefetched due
  // to trimming
  WriteBatch prefix_batch;
  for (int i = 0; i < 10; i++) {
    std::string key = kPrefix + std::to_string(i);
    ASSERT_OK(prefix_batch.Put(key, rnd.RandomString(100)));
  }
  ASSERT_OK(db_->Write(WriteOptions(), &prefix_batch));

  WriteBatch diff_prefix_batch;
  for (int i = 0; i < 10; i++) {
    std::string diff_prefix = std::string(1, char('c' + i)) + kPrefix.substr(1);
    std::string key = diff_prefix + std::to_string(i);
    ASSERT_OK(diff_prefix_batch.Put(key, rnd.RandomString(100)));
  }
  ASSERT_OK(db_->Write(WriteOptions(), &diff_prefix_batch));

  ASSERT_OK(db_->Flush(FlushOptions()));

  // To verify readahead is trimmed based on prefix by checking the counter
  // READAHEAD_TRIMMED
  ReadOptions ro;
  ro.prefix_same_as_start = true;
  ro.auto_readahead_size = auto_readahead_size;
  // Set a large readahead size to introduce readahead waste when without
  // trimming based on prefix
  ro.readahead_size = 1024 * 1024 * 1024;

  ASSERT_OK(options.statistics->Reset());
  {
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
    for (iter->Seek(kPrefix); iter->status().ok() && iter->Valid();
         iter->Next()) {
    }
  }

  auto readahead_trimmed =
      options.statistics->getTickerCount(READAHEAD_TRIMMED);

  if (auto_readahead_size) {
    ASSERT_GT(readahead_trimmed, 0);
  } else {
    ASSERT_EQ(readahead_trimmed, 0);
  }
  Close();
}
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies the functionality of ReadOptions.adaptive_readahead.
|
2022-07-06 18:42:59 +00:00
|
|
|
TEST_P(PrefetchTest, DBIterLevelReadAhead) {
|
2021-11-11 00:18:27 +00:00
|
|
|
const int kNumKeys = 1000;
|
|
|
|
// Set options
|
|
|
|
std::shared_ptr<MockFS> fs =
|
|
|
|
std::make_shared<MockFS>(env_->GetFileSystem(), false);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
|
|
|
|
2022-07-06 18:42:59 +00:00
|
|
|
bool use_direct_io = std::get<0>(GetParam());
|
2021-12-01 06:52:14 +00:00
|
|
|
bool is_adaptive_readahead = std::get<1>(GetParam());
|
2022-07-06 18:42:59 +00:00
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
2022-04-06 21:26:53 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2021-11-11 00:18:27 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2021-11-11 00:18:27 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
2022-04-06 21:26:53 +00:00
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
2021-11-11 00:18:27 +00:00
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
2022-03-21 14:12:43 +00:00
|
|
|
int total_keys = 0;
|
2021-11-11 00:18:27 +00:00
|
|
|
for (int j = 0; j < 5; j++) {
|
|
|
|
for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
2022-03-21 14:12:43 +00:00
|
|
|
total_keys++;
|
2021-11-11 00:18:27 +00:00
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
int readahead_carry_over_count = 0;
|
|
|
|
int num_sst_files = NumTableFilesAtLevel(2);
|
|
|
|
size_t current_readahead_size = 0;
|
|
|
|
|
|
|
|
// Test - Iterate over the keys sequentially.
|
|
|
|
{
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// The callback checks, since reads are sequential, readahead_size doesn't
|
|
|
|
// start from 8KB when iterator moves to next file and its called
|
|
|
|
// num_sst_files-1 times (excluding for first file).
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlockPrefetcher::SetReadaheadState", [&](void* arg) {
|
|
|
|
readahead_carry_over_count++;
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
size_t readahead_size = *static_cast<size_t*>(arg);
|
2023-01-20 18:17:57 +00:00
|
|
|
if (readahead_carry_over_count) {
|
|
|
|
ASSERT_GT(readahead_size, 8 * 1024);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FilePrefetchBuffer::TryReadFromCache", [&](void* arg) {
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
current_readahead_size = *static_cast<size_t*>(arg);
|
2023-01-20 18:17:57 +00:00
|
|
|
ASSERT_GT(current_readahead_size, 0);
|
|
|
|
});
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
if (is_adaptive_readahead) {
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
int num_keys = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
num_keys++;
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2023-01-20 18:17:57 +00:00
|
|
|
ASSERT_EQ(num_keys, total_keys);
|
|
|
|
|
|
|
|
// For index and data blocks.
|
|
|
|
if (is_adaptive_readahead) {
|
|
|
|
ASSERT_EQ(readahead_carry_over_count, 2 * (num_sst_files - 1));
|
|
|
|
} else {
|
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
|
|
|
ASSERT_EQ(readahead_carry_over_count, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
// This test verifies the functionality of ReadOptions.adaptive_readahead when
|
|
|
|
// async_io is enabled.
|
|
|
|
TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) {
|
2023-06-23 18:48:49 +00:00
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
2023-01-20 18:17:57 +00:00
|
|
|
const int kNumKeys = 1000;
|
|
|
|
// Set options
|
|
|
|
std::shared_ptr<MockFS> fs =
|
2023-06-23 18:48:49 +00:00
|
|
|
std::make_shared<MockFS>(FileSystem::Default(), false);
|
2023-01-20 18:17:57 +00:00
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
|
|
|
|
|
|
|
bool use_direct_io = std::get<0>(GetParam());
|
|
|
|
bool is_adaptive_readahead = std::get<1>(GetParam());
|
|
|
|
|
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
SetBlockBasedTableOptions(table_options);
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
int total_keys = 0;
|
|
|
|
for (int j = 0; j < 5; j++) {
|
|
|
|
for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
total_keys++;
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
2023-06-23 18:48:49 +00:00
|
|
|
int buff_prefetch_count = 0;
|
2023-01-20 18:17:57 +00:00
|
|
|
int readahead_carry_over_count = 0;
|
|
|
|
int num_sst_files = NumTableFilesAtLevel(2);
|
|
|
|
size_t current_readahead_size = 0;
|
2023-06-23 18:48:49 +00:00
|
|
|
bool read_async_called = false;
|
2023-01-20 18:17:57 +00:00
|
|
|
|
|
|
|
// Test - Iterate over the keys sequentially.
|
|
|
|
{
|
2023-06-23 18:48:49 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"UpdateResults::io_uring_result",
|
|
|
|
[&](void* /*arg*/) { read_async_called = true; });
|
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
// The callback checks, since reads are sequential, readahead_size doesn't
|
|
|
|
// start from 8KB when iterator moves to next file and its called
|
|
|
|
// num_sst_files-1 times (excluding for first file).
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlockPrefetcher::SetReadaheadState", [&](void* arg) {
|
|
|
|
readahead_carry_over_count++;
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
size_t readahead_size = *static_cast<size_t*>(arg);
|
2021-11-11 00:18:27 +00:00
|
|
|
if (readahead_carry_over_count) {
|
|
|
|
ASSERT_GT(readahead_size, 8 * 1024);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FilePrefetchBuffer::TryReadFromCache", [&](void* arg) {
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
current_readahead_size = *static_cast<size_t*>(arg);
|
2021-12-01 06:52:14 +00:00
|
|
|
ASSERT_GT(current_readahead_size, 0);
|
2021-11-11 00:18:27 +00:00
|
|
|
});
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
ReadOptions ro;
|
2021-12-01 06:52:14 +00:00
|
|
|
if (is_adaptive_readahead) {
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
}
|
2023-01-20 18:17:57 +00:00
|
|
|
ro.async_io = true;
|
2022-04-06 21:26:53 +00:00
|
|
|
|
|
|
|
ASSERT_OK(options.statistics->Reset());
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
int num_keys = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2021-11-11 00:18:27 +00:00
|
|
|
num_keys++;
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_EQ(num_keys, total_keys);
|
2022-04-11 20:46:57 +00:00
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
// For index and data blocks.
|
2021-12-01 06:52:14 +00:00
|
|
|
if (is_adaptive_readahead) {
|
|
|
|
ASSERT_EQ(readahead_carry_over_count, 2 * (num_sst_files - 1));
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(readahead_carry_over_count, 0);
|
|
|
|
}
|
2022-04-06 21:26:53 +00:00
|
|
|
|
|
|
|
// Check stats to make sure async prefetch is done.
|
|
|
|
{
|
|
|
|
HistogramData async_read_bytes;
|
|
|
|
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
2023-06-23 18:48:49 +00:00
|
|
|
// Not all platforms support iouring. In that case, ReadAsync in posix
|
|
|
|
// won't submit async requests.
|
|
|
|
if (read_async_called) {
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
2022-04-06 21:26:53 +00:00
|
|
|
ASSERT_GT(async_read_bytes.count, 0);
|
|
|
|
} else {
|
2023-06-23 18:48:49 +00:00
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
2022-04-06 21:26:53 +00:00
|
|
|
ASSERT_EQ(async_read_bytes.count, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-09-18 18:32:30 +00:00
|
|
|
TEST_P(PrefetchTest, AvoidBlockCacheLookupTwice) {
  const int kNumKeys = 1000;
  // Set options
  std::shared_ptr<MockFS> fs =
      std::make_shared<MockFS>(env_->GetFileSystem(), false);
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));

  bool use_direct_io = std::get<0>(GetParam());
  bool async_io = std::get<1>(GetParam());

  Options options;
  SetGenericOptions(env.get(), use_direct_io, options);
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions table_options;
  SetBlockBasedTableOptions(table_options);
  // 4 MiB block cache, 2^2 shards.
  std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
  table_options.block_cache = cache;
  table_options.no_block_cache = false;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  Status s = TryReopen(options);
  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
    // If direct IO is not supported, skip the test.
    return;
  } else {
    ASSERT_OK(s);
  }

  // Write to DB.
  {
    WriteBatch batch;
    Random rnd(309);
    for (int i = 0; i < kNumKeys; i++) {
      ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
    }
    ASSERT_OK(db_->Write(WriteOptions(), &batch));

    std::string start_key = BuildKey(0);
    std::string end_key = BuildKey(kNumKeys - 1);
    Slice least(start_key.data(), start_key.size());
    Slice greatest(end_key.data(), end_key.size());

    // Compact the full key range into a single sorted run.
    ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
  }

  ReadOptions ro;
  ro.async_io = async_io;
  // Iterate over the keys.
  {
    // Each block contains around 4 keys.
    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
    ASSERT_OK(options.statistics->Reset());

    iter->Seek(BuildKey(99));  // Prefetch data because of seek parallelization.
    ASSERT_TRUE(iter->Valid());

    // The sought block must be fetched (and counted as a miss) exactly once.
    ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS),
              1);
  }

  Close();
}
|
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
TEST_P(PrefetchTest, DBIterAsyncIONoIOUring) {
  if (mem_env_ || encrypted_env_) {
    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
    return;
  }

  const int kNumKeys = 1000;
  // Set options
  bool use_direct_io = std::get<0>(GetParam());
  bool is_adaptive_readahead = std::get<1>(GetParam());

  Options options;
  SetGenericOptions(Env::Default(), use_direct_io, options);
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions table_options;
  SetBlockBasedTableOptions(table_options);
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  // Disable io_uring for this test; async_io should then fall back to
  // synchronous reads with no ASYNC_READ_BYTES recorded.
  enable_io_uring = false;
  Status s = TryReopen(options);
  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
    // If direct IO is not supported, skip the test (restoring the flag).
    enable_io_uring = true;
    return;
  } else {
    ASSERT_OK(s);
  }

  // Load five flushed batches of sequential keys and push them to L2.
  WriteBatch batch;
  Random rnd(309);
  int total_keys = 0;
  for (int j = 0; j < 5; j++) {
    for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
      ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
      total_keys++;
    }
    ASSERT_OK(db_->Write(WriteOptions(), &batch));
    ASSERT_OK(Flush());
  }
  MoveFilesToLevel(2);

  // Test - Iterate over the keys sequentially.
  {
    ReadOptions ro;
    if (is_adaptive_readahead) {
      ro.adaptive_readahead = true;
    }
    ro.async_io = true;

    ASSERT_OK(options.statistics->Reset());

    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
    int num_keys = 0;
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      ASSERT_OK(iter->status());
      num_keys++;
    }
    ASSERT_OK(iter->status());
    ASSERT_EQ(num_keys, total_keys);

    // Check stats to make sure async prefetch is not done.
    {
      HistogramData async_read_bytes;
      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
      ASSERT_EQ(async_read_bytes.count, 0);
      ASSERT_EQ(options.statistics->getTickerCount(READ_ASYNC_MICROS), 0);
    }
  }

  // Repeat with a tailing iterator; still no async reads expected.
  {
    ReadOptions ro;
    if (is_adaptive_readahead) {
      ro.adaptive_readahead = true;
    }
    ro.async_io = true;
    ro.tailing = true;

    ASSERT_OK(options.statistics->Reset());

    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
    int num_keys = 0;
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      ASSERT_OK(iter->status());
      num_keys++;
    }
    ASSERT_OK(iter->status());
    ASSERT_EQ(num_keys, total_keys);

    // Check stats to make sure async prefetch is not done.
    {
      HistogramData async_read_bytes;
      options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
      ASSERT_EQ(async_read_bytes.count, 0);
      ASSERT_EQ(options.statistics->getTickerCount(READ_ASYNC_MICROS), 0);
    }
  }
  Close();

  enable_io_uring = true;
}
|
|
|
|
|
2022-07-06 18:42:59 +00:00
|
|
|
class PrefetchTest1 : public DBTestBase,
|
2021-12-01 06:52:14 +00:00
|
|
|
public ::testing::WithParamInterface<bool> {
|
|
|
|
public:
|
2022-07-06 18:42:59 +00:00
|
|
|
PrefetchTest1() : DBTestBase("prefetch_test1", true) {}
|
2023-01-20 18:17:57 +00:00
|
|
|
|
2023-06-23 18:48:49 +00:00
|
|
|
virtual void SetGenericOptions(Env* env, bool use_direct_io,
|
|
|
|
Options& options) {
|
2023-01-20 18:17:57 +00:00
|
|
|
options = CurrentOptions();
|
|
|
|
options.write_buffer_size = 1024;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
options.env = env;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
if (use_direct_io) {
|
|
|
|
options.use_direct_reads = true;
|
|
|
|
options.use_direct_io_for_flush_and_compaction = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetBlockBasedTableOptions(BlockBasedTableOptions& table_options) {
|
|
|
|
table_options.no_block_cache = true;
|
|
|
|
table_options.cache_index_and_filter_blocks = false;
|
|
|
|
table_options.metadata_block_size = 1024;
|
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
|
|
|
|
}
|
2021-12-01 06:52:14 +00:00
|
|
|
};
|
|
|
|
|
2022-07-06 18:42:59 +00:00
|
|
|
// Instantiate PrefetchTest1 with both values of its bool parameter
// (passed to SetGenericOptions as use_direct_io).
INSTANTIATE_TEST_CASE_P(PrefetchTest1, PrefetchTest1, ::testing::Bool());
|
2021-12-01 06:52:14 +00:00
|
|
|
|
2023-06-26 17:39:44 +00:00
|
|
|
TEST_P(PrefetchTest1, SeekWithExtraPrefetchAsyncIO) {
  const int kNumKeys = 2000;
  // Set options
  std::shared_ptr<MockFS> fs =
      std::make_shared<MockFS>(env_->GetFileSystem(), false);
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));

  Options options;
  SetGenericOptions(env.get(), GetParam(), options);
  options.statistics = CreateDBStatistics();
  BlockBasedTableOptions table_options;
  SetBlockBasedTableOptions(table_options);
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));

  Status s = TryReopen(options);
  if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
    // If direct IO is not supported, skip the test.
    return;
  } else {
    ASSERT_OK(s);
  }

  // Fill the DB and compact the whole key range into one sorted run.
  WriteBatch batch;
  Random rnd(309);
  for (int i = 0; i < kNumKeys; i++) {
    ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
  }
  ASSERT_OK(db_->Write(WriteOptions(), &batch));

  std::string start_key = BuildKey(0);
  std::string end_key = BuildKey(kNumKeys - 1);
  Slice least(start_key.data(), start_key.size());
  Slice greatest(end_key.data(), end_key.size());

  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
  Close();

  // Reopen with num_file_reads_for_auto_readahead = 0, 1, 2 and verify the
  // extra-prefetch path only triggers for the 0 case.
  int buff_prefetch_count = 0, extra_prefetch_buff_cnt = 0;
  for (size_t i = 0; i < 3; i++) {
    table_options.num_file_reads_for_auto_readahead = i;
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));

    s = TryReopen(options);
    ASSERT_OK(s);

    buff_prefetch_count = 0;
    extra_prefetch_buff_cnt = 0;
    SyncPoint::GetInstance()->SetCallBack(
        "FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching",
        [&](void*) { extra_prefetch_buff_cnt++; });

    SyncPoint::GetInstance()->SetCallBack(
        "FilePrefetchBuffer::Prefetch:Start",
        [&](void*) { buff_prefetch_count++; });

    SyncPoint::GetInstance()->EnableProcessing();

    ReadOptions ro;
    ro.async_io = true;
    {
      auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
      // First Seek
      iter->Seek(BuildKey(
          0));  // Prefetch data on seek because of seek parallelization.
      ASSERT_TRUE(iter->Valid());

      // Do extra prefetching in Seek only if
      // num_file_reads_for_auto_readahead = 0.
      ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0));
      // buff_prefetch_count is 2 because of index block when
      // num_file_reads_for_auto_readahead = 0.
      // If num_file_reads_for_auto_readahead > 0, index block isn't
      // prefetched.
      ASSERT_EQ(buff_prefetch_count, i == 0 ? 2 : 1);

      extra_prefetch_buff_cnt = 0;
      buff_prefetch_count = 0;
      // Reset all values of FilePrefetchBuffer on new seek.
      iter->Seek(
          BuildKey(22));  // Prefetch data because of seek parallelization.
      ASSERT_TRUE(iter->Valid());
      // Do extra prefetching in Seek only if
      // num_file_reads_for_auto_readahead = 0.
      ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0));
      ASSERT_EQ(buff_prefetch_count, 1);

      extra_prefetch_buff_cnt = 0;
      buff_prefetch_count = 0;
      // Reset all values of FilePrefetchBuffer on new seek.
      iter->Seek(
          BuildKey(33));  // Prefetch data because of seek parallelization.
      ASSERT_TRUE(iter->Valid());
      // Do extra prefetching in Seek only if
      // num_file_reads_for_auto_readahead = 0.
      ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0));
      ASSERT_EQ(buff_prefetch_count, 1);
    }
    Close();
  }
}
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies the functionality of ReadOptions.adaptive_readahead when
|
|
|
|
// reads are not sequential.
|
2022-07-06 18:42:59 +00:00
|
|
|
TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) {
|
2021-11-11 00:18:27 +00:00
|
|
|
const int kNumKeys = 1000;
|
|
|
|
// Set options
|
|
|
|
std::shared_ptr<MockFS> fs =
|
|
|
|
std::make_shared<MockFS>(env_->GetFileSystem(), false);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), GetParam(), options);
|
2021-11-11 00:18:27 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2021-11-11 00:18:27 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
for (int j = 0; j < 5; j++) {
|
|
|
|
for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
int set_readahead = 0;
|
|
|
|
size_t readahead_size = 0;
|
|
|
|
|
2022-05-20 23:09:33 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
2021-11-11 00:18:27 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlockPrefetcher::SetReadaheadState",
|
|
|
|
[&](void* /*arg*/) { set_readahead++; });
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FilePrefetchBuffer::TryReadFromCache",
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
[&](void* arg) { readahead_size = *static_cast<size_t*>(arg); });
|
2021-11-11 00:18:27 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
{
|
|
|
|
// Iterate until prefetch is done.
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
2022-05-20 23:09:33 +00:00
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
iter->SeekToFirst();
|
2022-05-20 23:09:33 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
while (iter->Valid() && buff_prefetch_count == 0) {
|
|
|
|
iter->Next();
|
|
|
|
}
|
2022-05-20 23:09:33 +00:00
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
ASSERT_EQ(readahead_size, 8 * 1024);
|
|
|
|
ASSERT_EQ(buff_prefetch_count, 1);
|
|
|
|
ASSERT_EQ(set_readahead, 0);
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
|
|
|
|
// Move to last file and check readahead size fallbacks to 8KB. So next
|
|
|
|
// readahead size after prefetch should be 8 * 1024;
|
|
|
|
iter->Seek(BuildKey(4004));
|
2022-05-20 23:09:33 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
while (iter->Valid() && buff_prefetch_count == 0) {
|
|
|
|
iter->Next();
|
|
|
|
}
|
2022-05-20 23:09:33 +00:00
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
ASSERT_EQ(readahead_size, 8 * 1024);
|
|
|
|
ASSERT_EQ(set_readahead, 0);
|
|
|
|
ASSERT_EQ(buff_prefetch_count, 1);
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-08-18 22:52:04 +00:00
|
|
|
// This test verifies the functionality of adaptive_readaheadsize with cache
|
|
|
|
// and if block is found in cache, decrease the readahead_size if
|
2023-01-20 18:17:57 +00:00
|
|
|
// - its enabled internally by RocksDB (implicit_auto_readahead_) and,
|
|
|
|
// - readahead_size is greater than 0 and,
|
|
|
|
// - the block would have called prefetch API if not found in cache for
|
|
|
|
// which conditions are:
|
|
|
|
// - few/no bytes are in buffer and,
|
|
|
|
// - block is sequential with the previous read and,
|
|
|
|
// - num_file_reads_ + 1 (including this read) >
|
|
|
|
// num_file_reads_for_auto_readahead_
|
2022-07-06 18:42:59 +00:00
|
|
|
TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) {
|
2021-11-11 00:18:27 +00:00
|
|
|
const int kNumKeys = 2000;
|
|
|
|
// Set options
|
|
|
|
std::shared_ptr<MockFS> fs =
|
|
|
|
std::make_shared<MockFS>(env_->GetFileSystem(), false);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), GetParam(), options);
|
2022-04-06 21:26:53 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
2021-11-11 00:18:27 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2021-11-11 00:18:27 +00:00
|
|
|
std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB
|
|
|
|
table_options.block_cache = cache;
|
2023-01-20 18:17:57 +00:00
|
|
|
table_options.no_block_cache = false;
|
2021-11-11 00:18:27 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
|
|
|
|
std::string start_key = BuildKey(0);
|
|
|
|
std::string end_key = BuildKey(kNumKeys - 1);
|
|
|
|
Slice least(start_key.data(), start_key.size());
|
|
|
|
Slice greatest(end_key.data(), end_key.size());
|
|
|
|
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
|
|
|
|
|
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
size_t current_readahead_size = 0;
|
|
|
|
size_t expected_current_readahead_size = 8 * 1024;
|
|
|
|
size_t decrease_readahead_size = 8 * 1024;
|
|
|
|
|
2022-06-16 03:17:35 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
2021-11-11 00:18:27 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
"FilePrefetchBuffer::TryReadFromCache",
|
|
|
|
[&](void* arg) { current_readahead_size = *static_cast<size_t*>(arg); });
|
2021-11-11 00:18:27 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Reseek keys from sequential Data Blocks within same partitioned
|
|
|
|
* index. After 2 sequential reads it will prefetch the data block.
|
|
|
|
* Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data
|
|
|
|
* more initially (2 more data blocks).
|
|
|
|
*/
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
// Warm up the cache
|
|
|
|
iter->Seek(BuildKey(1011));
|
2022-05-20 23:09:33 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2021-11-11 00:18:27 +00:00
|
|
|
iter->Seek(BuildKey(1015));
|
2022-05-20 23:09:33 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2021-11-11 00:18:27 +00:00
|
|
|
iter->Seek(BuildKey(1019));
|
2022-05-20 23:09:33 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2021-11-11 00:18:27 +00:00
|
|
|
buff_prefetch_count = 0;
|
|
|
|
}
|
2022-04-06 21:26:53 +00:00
|
|
|
|
2021-11-11 00:18:27 +00:00
|
|
|
{
|
2022-04-06 21:26:53 +00:00
|
|
|
ASSERT_OK(options.statistics->Reset());
|
2021-11-11 00:18:27 +00:00
|
|
|
// After caching, blocks will be read from cache (Sequential blocks)
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
2022-05-20 23:09:33 +00:00
|
|
|
iter->Seek(
|
|
|
|
BuildKey(0)); // In cache so it will decrease the readahead_size.
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2022-05-20 23:09:33 +00:00
|
|
|
expected_current_readahead_size = std::max(
|
|
|
|
decrease_readahead_size,
|
|
|
|
(expected_current_readahead_size >= decrease_readahead_size
|
|
|
|
? (expected_current_readahead_size - decrease_readahead_size)
|
|
|
|
: 0));
|
|
|
|
|
2022-06-16 03:17:35 +00:00
|
|
|
iter->Seek(BuildKey(1000)); // Won't prefetch the block.
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2022-05-20 23:09:33 +00:00
|
|
|
ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
|
|
|
|
|
|
|
|
iter->Seek(BuildKey(1004)); // Prefetch the block.
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2021-11-11 00:18:27 +00:00
|
|
|
ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
|
2022-05-20 23:09:33 +00:00
|
|
|
expected_current_readahead_size *= 2;
|
2021-11-11 00:18:27 +00:00
|
|
|
|
|
|
|
iter->Seek(BuildKey(1011));
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2021-11-11 00:18:27 +00:00
|
|
|
|
|
|
|
// Eligible to Prefetch data (not in buffer) but block is in cache so no
|
|
|
|
// prefetch will happen and will result in decrease in readahead_size.
|
|
|
|
// readahead_size will be 8 * 1024
|
|
|
|
iter->Seek(BuildKey(1015));
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2022-05-20 23:09:33 +00:00
|
|
|
expected_current_readahead_size = std::max(
|
|
|
|
decrease_readahead_size,
|
|
|
|
(expected_current_readahead_size >= decrease_readahead_size
|
|
|
|
? (expected_current_readahead_size - decrease_readahead_size)
|
|
|
|
: 0));
|
2021-11-11 00:18:27 +00:00
|
|
|
|
|
|
|
// 1016 is the same block as 1015. So no change in readahead_size.
|
|
|
|
iter->Seek(BuildKey(1016));
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2021-11-11 00:18:27 +00:00
|
|
|
|
|
|
|
// Prefetch data (not in buffer) but found in cache. So decrease
|
2023-08-18 22:52:04 +00:00
|
|
|
// readahead_size. Since it will 0 after decrementing so readahead_size
|
|
|
|
// will be set to initial value.
|
2021-11-11 00:18:27 +00:00
|
|
|
iter->Seek(BuildKey(1019));
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2021-11-11 00:18:27 +00:00
|
|
|
expected_current_readahead_size = std::max(
|
|
|
|
decrease_readahead_size,
|
|
|
|
(expected_current_readahead_size >= decrease_readahead_size
|
|
|
|
? (expected_current_readahead_size - decrease_readahead_size)
|
|
|
|
: 0));
|
|
|
|
|
|
|
|
// Prefetch next sequential data.
|
|
|
|
iter->Seek(BuildKey(1022));
|
2022-03-21 14:12:43 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
2021-11-11 00:18:27 +00:00
|
|
|
ASSERT_EQ(current_readahead_size, expected_current_readahead_size);
|
2022-06-16 03:17:35 +00:00
|
|
|
ASSERT_EQ(buff_prefetch_count, 2);
|
|
|
|
|
|
|
|
buff_prefetch_count = 0;
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies the basic functionality of seek parallelization for
|
|
|
|
// async_io.
|
2022-07-06 18:42:59 +00:00
|
|
|
TEST_P(PrefetchTest1, SeekParallelizationTest) {
|
2023-06-23 18:48:49 +00:00
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
2022-06-16 03:17:35 +00:00
|
|
|
const int kNumKeys = 2000;
|
|
|
|
// Set options
|
2023-06-23 18:48:49 +00:00
|
|
|
std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
|
|
|
|
FileSystem::Default(), /*support_prefetch=*/false);
|
2022-06-16 03:17:35 +00:00
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), GetParam(), options);
|
2022-06-16 03:17:35 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2022-06-16 03:17:35 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
|
|
|
|
std::string start_key = BuildKey(0);
|
|
|
|
std::string end_key = BuildKey(kNumKeys - 1);
|
|
|
|
Slice least(start_key.data(), start_key.size());
|
|
|
|
Slice greatest(end_key.data(), end_key.size());
|
|
|
|
|
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
|
|
|
|
|
|
|
|
int buff_prefetch_count = 0;
|
2023-06-23 18:48:49 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
|
|
|
|
|
|
|
bool read_async_called = false;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"UpdateResults::io_uring_result",
|
|
|
|
[&](void* /*arg*/) { read_async_called = true; });
|
2022-06-16 03:17:35 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
ro.async_io = true;
|
|
|
|
|
|
|
|
{
|
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
// Each block contains around 4 keys.
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
iter->Seek(BuildKey(0)); // Prefetch data because of seek parallelization.
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
|
|
|
|
// New data block. Since num_file_reads in FilePrefetch after this read is
|
|
|
|
// 2, it won't go for prefetching.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
|
|
|
|
// Prefetch data.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
|
2023-06-23 18:48:49 +00:00
|
|
|
HistogramData async_read_bytes;
|
|
|
|
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
2023-08-18 22:52:04 +00:00
|
|
|
// not all platforms support io_uring. In that case it'll fallback to
|
|
|
|
// normal prefetching without async_io.
|
2023-06-23 18:48:49 +00:00
|
|
|
if (read_async_called) {
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_EQ(buff_prefetch_count, 2);
|
2022-07-06 18:42:59 +00:00
|
|
|
ASSERT_GT(async_read_bytes.count, 0);
|
|
|
|
ASSERT_GT(get_perf_context()->number_async_seek, 0);
|
2023-06-23 18:48:49 +00:00
|
|
|
} else {
|
|
|
|
ASSERT_EQ(buff_prefetch_count, 1);
|
2022-04-06 21:26:53 +00:00
|
|
|
}
|
2021-11-11 00:18:27 +00:00
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2022-07-06 18:42:59 +00:00
|
|
|
namespace {
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
#ifdef GFLAGS
|
2022-10-25 01:34:52 +00:00
|
|
|
// Limits for the fake argv built by RunIOTracerParserTool below: at most
// kMaxArgCount arguments packed into a single kArgBufferSize-byte buffer.
const int kMaxArgCount = 100;
const size_t kArgBufferSize = 100000;
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
void RunIOTracerParserTool(std::string trace_file) {
|
|
|
|
std::vector<std::string> params = {"./io_tracer_parser",
|
|
|
|
"-io_trace_file=" + trace_file};
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
char arg_buffer[kArgBufferSize];
|
|
|
|
char* argv[kMaxArgCount];
|
|
|
|
int argc = 0;
|
|
|
|
int cursor = 0;
|
|
|
|
for (const auto& arg : params) {
|
|
|
|
ASSERT_LE(cursor + arg.size() + 1, kArgBufferSize);
|
|
|
|
ASSERT_LE(argc + 1, kMaxArgCount);
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
snprintf(arg_buffer + cursor, arg.size() + 1, "%s", arg.c_str());
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
argv[argc++] = arg_buffer + cursor;
|
|
|
|
cursor += static_cast<int>(arg.size()) + 1;
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv));
|
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
#endif // GFLAGS
|
2022-10-25 01:34:52 +00:00
|
|
|
} // namespace
|
2022-04-11 20:46:57 +00:00
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// Tests the default implementation of ReadAsync API with PosixFileSystem during
|
|
|
|
// prefetching.
|
2022-10-25 01:34:52 +00:00
|
|
|
TEST_P(PrefetchTest, ReadAsyncWithPosixFS) {
|
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
2022-04-04 22:35:43 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
const int kNumKeys = 1000;
|
|
|
|
std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
|
|
|
|
FileSystem::Default(), /*support_prefetch=*/false);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
2022-04-04 22:35:43 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
bool use_direct_io = std::get<0>(GetParam());
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
2022-10-25 01:34:52 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2022-10-25 01:34:52 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
int total_keys = 0;
|
|
|
|
// Write the keys.
|
|
|
|
{
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
for (int j = 0; j < 5; j++) {
|
|
|
|
for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
total_keys++;
|
2022-04-04 22:35:43 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
ASSERT_OK(Flush());
|
2022-04-04 22:35:43 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
MoveFilesToLevel(2);
|
|
|
|
}
|
2022-04-11 20:46:57 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
bool read_async_called = false;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
ro.async_io = true;
|
|
|
|
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ro.readahead_size = 16 * 1024;
|
|
|
|
}
|
|
|
|
|
2024-01-05 17:29:01 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
2022-10-25 01:34:52 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"UpdateResults::io_uring_result",
|
|
|
|
[&](void* /*arg*/) { read_async_called = true; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Read the keys.
|
|
|
|
{
|
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
get_perf_context()->Reset();
|
2022-05-23 19:15:26 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
int num_keys = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
num_keys++;
|
2022-04-04 22:35:43 +00:00
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-05-23 19:15:26 +00:00
|
|
|
|
2023-02-17 02:33:06 +00:00
|
|
|
if (read_async_called) {
|
|
|
|
ASSERT_EQ(num_keys, total_keys);
|
|
|
|
// Check stats to make sure async prefetch is done.
|
2022-10-25 01:34:52 +00:00
|
|
|
HistogramData async_read_bytes;
|
|
|
|
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
|
|
|
HistogramData prefetched_bytes_discarded;
|
|
|
|
options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
|
|
|
|
&prefetched_bytes_discarded);
|
2023-02-17 02:33:06 +00:00
|
|
|
ASSERT_GT(async_read_bytes.count, 0);
|
|
|
|
ASSERT_GT(prefetched_bytes_discarded.count, 0);
|
|
|
|
ASSERT_EQ(get_perf_context()->number_async_seek, 0);
|
|
|
|
} else {
|
2022-10-25 01:34:52 +00:00
|
|
|
// Not all platforms support iouring. In that case, ReadAsync in posix
|
|
|
|
// won't submit async requests.
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(num_keys, total_keys);
|
2022-10-25 01:34:52 +00:00
|
|
|
}
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
2022-10-25 01:34:52 +00:00
|
|
|
}
|
2022-05-23 19:15:26 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
2022-04-04 22:35:43 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
Close();
|
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies implementation of seek parallelization with
|
|
|
|
// PosixFileSystem during prefetching.
|
2022-10-25 01:34:52 +00:00
|
|
|
TEST_P(PrefetchTest, MultipleSeekWithPosixFS) {
|
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
const int kNumKeys = 1000;
|
|
|
|
std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
|
|
|
|
FileSystem::Default(), /*support_prefetch=*/false);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
bool use_direct_io = std::get<0>(GetParam());
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
2022-10-25 01:34:52 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2022-10-25 01:34:52 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
int total_keys = 0;
|
|
|
|
// Write the keys.
|
|
|
|
{
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
for (int j = 0; j < 5; j++) {
|
|
|
|
for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
total_keys++;
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
ASSERT_OK(Flush());
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
MoveFilesToLevel(2);
|
|
|
|
}
|
2023-05-03 16:37:21 +00:00
|
|
|
(void)total_keys;
|
2022-10-25 01:34:52 +00:00
|
|
|
|
|
|
|
int num_keys_first_batch = 0;
|
|
|
|
int num_keys_second_batch = 0;
|
|
|
|
// Calculate number of keys without async_io for correctness validation.
|
|
|
|
{
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
|
|
|
|
// First Seek.
|
|
|
|
iter->Seek(BuildKey(450));
|
|
|
|
while (iter->Valid() && num_keys_first_batch < 100) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
num_keys_first_batch++;
|
|
|
|
iter->Next();
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
iter->Seek(BuildKey(942));
|
|
|
|
while (iter->Valid()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
num_keys_second_batch++;
|
|
|
|
iter->Next();
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
bool read_async_called = false;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
ro.async_io = true;
|
|
|
|
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ro.readahead_size = 16 * 1024;
|
|
|
|
}
|
|
|
|
|
2024-01-05 17:29:01 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
2022-10-25 01:34:52 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"UpdateResults::io_uring_result",
|
|
|
|
[&](void* /*arg*/) { read_async_called = true; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Read the keys using seek.
|
|
|
|
{
|
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
int num_keys = 0;
|
|
|
|
// First Seek.
|
2022-09-13 00:42:01 +00:00
|
|
|
{
|
|
|
|
iter->Seek(BuildKey(450));
|
2022-10-25 01:34:52 +00:00
|
|
|
while (iter->Valid() && num_keys < 100) {
|
2022-09-13 00:42:01 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-10-25 01:34:52 +00:00
|
|
|
num_keys++;
|
2022-09-13 00:42:01 +00:00
|
|
|
iter->Next();
|
|
|
|
}
|
2023-02-17 02:33:06 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ(num_keys, num_keys_first_batch);
|
|
|
|
// Check stats to make sure async prefetch is done.
|
|
|
|
HistogramData async_read_bytes;
|
|
|
|
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
2023-02-17 02:33:06 +00:00
|
|
|
if (read_async_called) {
|
|
|
|
ASSERT_GT(async_read_bytes.count, 0);
|
|
|
|
ASSERT_GT(get_perf_context()->number_async_seek, 0);
|
|
|
|
} else {
|
2022-10-25 01:34:52 +00:00
|
|
|
// Not all platforms support iouring. In that case, ReadAsync in posix
|
|
|
|
// won't submit async requests.
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(async_read_bytes.count, 0);
|
|
|
|
ASSERT_EQ(get_perf_context()->number_async_seek, 0);
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
// Second Seek.
|
2022-09-13 00:42:01 +00:00
|
|
|
{
|
2022-10-25 01:34:52 +00:00
|
|
|
num_keys = 0;
|
2022-09-13 00:42:01 +00:00
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
iter->Seek(BuildKey(942));
|
|
|
|
while (iter->Valid()) {
|
2022-09-13 00:42:01 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2022-10-25 01:34:52 +00:00
|
|
|
num_keys++;
|
|
|
|
iter->Next();
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
ASSERT_EQ(num_keys, num_keys_second_batch);
|
|
|
|
HistogramData async_read_bytes;
|
|
|
|
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
|
|
|
HistogramData prefetched_bytes_discarded;
|
|
|
|
options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED,
|
|
|
|
&prefetched_bytes_discarded);
|
|
|
|
ASSERT_GT(prefetched_bytes_discarded.count, 0);
|
2022-10-25 01:34:52 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
if (read_async_called) {
|
2023-02-17 02:33:06 +00:00
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
|
|
|
|
|
|
|
// Check stats to make sure async prefetch is done.
|
|
|
|
ASSERT_GT(async_read_bytes.count, 0);
|
|
|
|
ASSERT_GT(get_perf_context()->number_async_seek, 0);
|
|
|
|
} else {
|
2022-10-25 01:34:52 +00:00
|
|
|
// Not all platforms support iouring. In that case, ReadAsync in posix
|
|
|
|
// won't submit async requests.
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(async_read_bytes.count, 0);
|
|
|
|
ASSERT_EQ(get_perf_context()->number_async_seek, 0);
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
}
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies implementation of seek parallelization with
|
|
|
|
// PosixFileSystem during prefetching.
|
|
|
|
TEST_P(PrefetchTest, SeekParallelizationTestWithPosix) {
|
2022-10-25 01:34:52 +00:00
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
const int kNumKeys = 2000;
|
|
|
|
// Set options
|
|
|
|
std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
|
|
|
|
FileSystem::Default(), /*support_prefetch=*/false);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
bool use_direct_io = std::get<0>(GetParam());
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
2022-10-25 01:34:52 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2022-10-25 01:34:52 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
for (int i = 0; i < kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
std::string start_key = BuildKey(0);
|
|
|
|
std::string end_key = BuildKey(kNumKeys - 1);
|
|
|
|
Slice least(start_key.data(), start_key.size());
|
|
|
|
Slice greatest(end_key.data(), end_key.size());
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest));
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
int buff_prefetch_count = 0;
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2024-01-05 17:29:01 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
bool read_async_called = false;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"UpdateResults::io_uring_result",
|
|
|
|
[&](void* /*arg*/) { read_async_called = true; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
ro.async_io = true;
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ro.readahead_size = 16 * 1024;
|
|
|
|
}
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
{
|
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
// Each block contains around 4 keys.
|
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
iter->Seek(BuildKey(0)); // Prefetch data because of seek parallelization.
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
// New data block. Since num_file_reads in FilePrefetch after this read is
|
|
|
|
// 2, it won't go for prefetching.
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
// Prefetch data.
|
|
|
|
iter->Next();
|
2022-10-25 01:34:52 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
HistogramData async_read_bytes;
|
|
|
|
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
|
|
|
if (read_async_called) {
|
|
|
|
ASSERT_GT(async_read_bytes.count, 0);
|
|
|
|
ASSERT_GT(get_perf_context()->number_async_seek, 0);
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ASSERT_EQ(buff_prefetch_count, 1);
|
2022-10-25 01:34:52 +00:00
|
|
|
} else {
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(buff_prefetch_count, 2);
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
2023-03-17 21:57:09 +00:00
|
|
|
} else {
|
|
|
|
// Not all platforms support iouring. In that case, ReadAsync in posix
|
|
|
|
// won't submit async requests.
|
|
|
|
ASSERT_EQ(async_read_bytes.count, 0);
|
|
|
|
ASSERT_EQ(get_perf_context()->number_async_seek, 0);
|
2022-09-13 00:42:01 +00:00
|
|
|
}
|
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
Close();
|
|
|
|
}
|
2022-09-13 00:42:01 +00:00
|
|
|
|
2022-07-06 18:42:59 +00:00
|
|
|
#ifdef GFLAGS
|
2023-01-20 18:17:57 +00:00
|
|
|
// This test verifies io_tracing with PosixFileSystem during prefetching.
|
2022-10-25 01:34:52 +00:00
|
|
|
TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) {
|
|
|
|
if (mem_env_ || encrypted_env_) {
|
|
|
|
ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
|
|
|
|
return;
|
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
const int kNumKeys = 1000;
|
|
|
|
std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
|
|
|
|
FileSystem::Default(), /*support_prefetch=*/false);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
2022-07-06 18:42:59 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
bool use_direct_io = std::get<0>(GetParam());
|
2023-01-20 18:17:57 +00:00
|
|
|
Options options;
|
|
|
|
SetGenericOptions(env.get(), use_direct_io, options);
|
2022-10-25 01:34:52 +00:00
|
|
|
options.statistics = CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options;
|
2023-01-20 18:17:57 +00:00
|
|
|
SetBlockBasedTableOptions(table_options);
|
2022-10-25 01:34:52 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
|
|
|
|
// If direct IO is not supported, skip the test
|
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
int total_keys = 0;
|
|
|
|
// Write the keys.
|
|
|
|
{
|
|
|
|
WriteBatch batch;
|
|
|
|
Random rnd(309);
|
|
|
|
for (int j = 0; j < 5; j++) {
|
|
|
|
for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) {
|
|
|
|
ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000)));
|
|
|
|
total_keys++;
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
ASSERT_OK(db_->Write(WriteOptions(), &batch));
|
|
|
|
ASSERT_OK(Flush());
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
MoveFilesToLevel(2);
|
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
int buff_prefetch_count = 0;
|
|
|
|
bool read_async_called = false;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.adaptive_readahead = true;
|
|
|
|
ro.async_io = true;
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ro.readahead_size = 16 * 1024;
|
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2024-01-05 17:29:01 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
|
|
|
|
[&](void*) { buff_prefetch_count++; });
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"UpdateResults::io_uring_result",
|
|
|
|
[&](void* /*arg*/) { read_async_called = true; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
// Read the keys.
|
|
|
|
{
|
|
|
|
// Start io_tracing.
|
|
|
|
WriteOptions write_opt;
|
|
|
|
TraceOptions trace_opt;
|
|
|
|
std::unique_ptr<TraceWriter> trace_writer;
|
|
|
|
std::string trace_file_path = dbname_ + "/io_trace_file";
|
|
|
|
|
|
|
|
ASSERT_OK(
|
|
|
|
NewFileTraceWriter(env_, EnvOptions(), trace_file_path, &trace_writer));
|
|
|
|
ASSERT_OK(db_->StartIOTrace(trace_opt, std::move(trace_writer)));
|
|
|
|
ASSERT_OK(options.statistics->Reset());
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
|
|
|
|
int num_keys = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
num_keys++;
|
|
|
|
}
|
2023-10-18 16:38:38 +00:00
|
|
|
ASSERT_OK(iter->status());
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
// End the tracing.
|
|
|
|
ASSERT_OK(db_->EndIOTrace());
|
|
|
|
ASSERT_OK(env_->FileExists(trace_file_path));
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(num_keys, total_keys);
|
|
|
|
HistogramData async_read_bytes;
|
|
|
|
options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
2023-02-17 02:33:06 +00:00
|
|
|
if (read_async_called) {
|
|
|
|
ASSERT_GT(buff_prefetch_count, 0);
|
|
|
|
// Check stats to make sure async prefetch is done.
|
|
|
|
ASSERT_GT(async_read_bytes.count, 0);
|
|
|
|
} else {
|
2022-10-25 01:34:52 +00:00
|
|
|
// Not all platforms support iouring. In that case, ReadAsync in posix
|
|
|
|
// won't submit async requests.
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(async_read_bytes.count, 0);
|
2022-07-06 18:42:59 +00:00
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
|
2022-10-25 01:34:52 +00:00
|
|
|
// Check the file to see if ReadAsync is logged.
|
|
|
|
RunIOTracerParserTool(trace_file_path);
|
2022-07-06 18:42:59 +00:00
|
|
|
}
|
2022-10-25 01:34:52 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
Close();
|
|
|
|
}
|
Provide support for IOTracing for ReadAsync API (#9833)
Summary:
Same as title
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9833
Test Plan:
Add unit test and manually check the output of tracing logs
For fixed readahead_size it logs as:
```
Access Time : 193352113447923 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15075 , IO Status: OK, Length: 12288, Offset: 659456
Access Time : 193352113465232 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 14425 , IO Status: OK, Length: 12288, Offset: 671744
Access Time : 193352113481539 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13062 , IO Status: OK, Length: 12288, Offset: 684032
Access Time : 193352113497692 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13649 , IO Status: OK, Length: 12288, Offset: 696320
Access Time : 193352113520043 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 19384 , IO Status: OK, Length: 12288, Offset: 708608
Access Time : 193352113538401 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 15406 , IO Status: OK, Length: 12288, Offset: 720896
Access Time : 193352113554855 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13670 , IO Status: OK, Length: 12288, Offset: 733184
Access Time : 193352113571624 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13855 , IO Status: OK, Length: 12288, Offset: 745472
Access Time : 193352113587924 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 13953 , IO Status: OK, Length: 12288, Offset: 757760
Access Time : 193352113603285 , File Name: 000026.sst , File Operation: Prefetch , Latency: 59 , IO Status: Not implemented: Prefetch not supported, Length: 8868, Offset: 898349
```
For implicit readahead:
```
Access Time : 193351865156587 , File Name: 000026.sst , File Operation: Prefetch , Latency: 48 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 391174
Access Time : 193351865160354 , File Name: 000026.sst , File Operation: Prefetch , Latency: 51 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 395248
Access Time : 193351865164253 , File Name: 000026.sst , File Operation: Prefetch , Latency: 49 , IO Status: Not implemented: Prefetch not supported, Length: 12266, Offset: 399322
Access Time : 193351865165461 , File Name: 000026.sst , File Operation: ReadAsync , Latency: 222871 , IO Status: OK, Length: 135168, Offset: 401408
```
Reviewed By: anand1976
Differential Revision: D35601634
Pulled By: akankshamahajan15
fbshipit-source-id: 5a4f32a850af878efa0767bd5706380152a1f26e
2022-05-26 02:47:03 +00:00
|
|
|
#endif // GFLAGS
|
2023-01-13 02:09:07 +00:00
|
|
|
|
|
|
|
class FilePrefetchBufferTest : public testing::Test {
|
|
|
|
public:
|
|
|
|
void SetUp() override {
|
|
|
|
SetupSyncPointsToMockDirectIO();
|
|
|
|
env_ = Env::Default();
|
|
|
|
fs_ = FileSystem::Default();
|
|
|
|
test_dir_ = test::PerThreadDBPath("file_prefetch_buffer_test");
|
|
|
|
ASSERT_OK(fs_->CreateDir(test_dir_, IOOptions(), nullptr));
|
2023-01-20 18:17:57 +00:00
|
|
|
stats_ = CreateDBStatistics();
|
2023-01-13 02:09:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void TearDown() override { EXPECT_OK(DestroyDir(env_, test_dir_)); }
|
|
|
|
|
|
|
|
void Write(const std::string& fname, const std::string& content) {
|
|
|
|
std::unique_ptr<FSWritableFile> f;
|
|
|
|
ASSERT_OK(fs_->NewWritableFile(Path(fname), FileOptions(), &f, nullptr));
|
|
|
|
ASSERT_OK(f->Append(content, IOOptions(), nullptr));
|
|
|
|
ASSERT_OK(f->Close(IOOptions(), nullptr));
|
|
|
|
}
|
|
|
|
|
|
|
|
void Read(const std::string& fname, const FileOptions& opts,
|
|
|
|
std::unique_ptr<RandomAccessFileReader>* reader) {
|
|
|
|
std::string fpath = Path(fname);
|
|
|
|
std::unique_ptr<FSRandomAccessFile> f;
|
|
|
|
ASSERT_OK(fs_->NewRandomAccessFile(fpath, opts, &f, nullptr));
|
2023-01-20 18:17:57 +00:00
|
|
|
reader->reset(new RandomAccessFileReader(
|
|
|
|
std::move(f), fpath, env_->GetSystemClock().get(),
|
|
|
|
/*io_tracer=*/nullptr, stats_.get()));
|
2023-01-13 02:09:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void AssertResult(const std::string& content,
|
|
|
|
const std::vector<FSReadRequest>& reqs) {
|
|
|
|
for (const auto& r : reqs) {
|
|
|
|
ASSERT_OK(r.status);
|
|
|
|
ASSERT_EQ(r.len, r.result.size());
|
|
|
|
ASSERT_EQ(content.substr(r.offset, r.len), r.result.ToString());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
FileSystem* fs() { return fs_.get(); }
|
2023-01-20 18:17:57 +00:00
|
|
|
Statistics* stats() { return stats_.get(); }
|
2023-01-13 02:09:07 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
Env* env_;
|
|
|
|
std::shared_ptr<FileSystem> fs_;
|
|
|
|
std::string test_dir_;
|
2023-01-20 18:17:57 +00:00
|
|
|
std::shared_ptr<Statistics> stats_;
|
2023-01-13 02:09:07 +00:00
|
|
|
|
|
|
|
std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) {
|
|
|
|
std::string fname = "seek-with-block-cache-hit";
|
|
|
|
Random rand(0);
|
|
|
|
std::string content = rand.RandomString(32768);
|
|
|
|
Write(fname, content);
|
|
|
|
|
|
|
|
FileOptions opts;
|
|
|
|
std::unique_ptr<RandomAccessFileReader> r;
|
|
|
|
Read(fname, opts, &r);
|
|
|
|
|
2024-01-05 17:29:01 +00:00
|
|
|
ReadaheadParams readahead_params;
|
|
|
|
readahead_params.initial_readahead_size = 16384;
|
|
|
|
readahead_params.max_readahead_size = 16384;
|
|
|
|
|
|
|
|
FilePrefetchBuffer fpb(readahead_params, true, false, fs());
|
2023-01-13 02:09:07 +00:00
|
|
|
Slice result;
|
|
|
|
// Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings,
|
|
|
|
// it will do two reads of 4096+8192 and 8192
|
|
|
|
Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result);
|
2023-02-17 02:33:06 +00:00
|
|
|
|
|
|
|
// Platforms that don't have IO uring may not support async IO.
|
|
|
|
if (s.IsNotSupported()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_TRUE(s.IsTryAgain());
|
2023-01-13 02:09:07 +00:00
|
|
|
// Simulate a block cache hit
|
|
|
|
fpb.UpdateReadPattern(0, 4096, false);
|
|
|
|
// Now read some data that straddles the two prefetch buffers - offset 8192 to
|
|
|
|
// 16384
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
IOOptions io_opts;
|
|
|
|
io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_TRUE(fpb.TryReadFromCache(io_opts, r.get(), 8192, 8192, &result, &s));
|
2023-01-13 02:09:07 +00:00
|
|
|
}
|
2023-01-20 18:17:57 +00:00
|
|
|
|
2023-09-11 18:41:44 +00:00
|
|
|
// Test to ensure when PrefetchAsync is called during seek, it doesn't do any
|
|
|
|
// alignment or prefetch extra if readahead is not enabled during seek.
|
|
|
|
TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) {
|
2024-01-05 17:29:01 +00:00
|
|
|
std::string fname = "seek-without-alignment";
|
2023-09-11 18:41:44 +00:00
|
|
|
Random rand(0);
|
|
|
|
std::string content = rand.RandomString(32768);
|
|
|
|
Write(fname, content);
|
|
|
|
|
|
|
|
FileOptions opts;
|
|
|
|
std::unique_ptr<RandomAccessFileReader> r;
|
|
|
|
Read(fname, opts, &r);
|
|
|
|
|
|
|
|
size_t alignment = r->file()->GetRequiredBufferAlignment();
|
|
|
|
size_t n = alignment / 2;
|
|
|
|
|
|
|
|
int read_async_called = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FilePrefetchBuffer::ReadAsync",
|
|
|
|
[&](void* /*arg*/) { read_async_called++; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Without readahead enabled, there will be no alignment and offset of buffer
|
|
|
|
// will be n.
|
|
|
|
{
|
2024-01-05 17:29:01 +00:00
|
|
|
ReadaheadParams readahead_params;
|
|
|
|
readahead_params.initial_readahead_size = 8192;
|
|
|
|
readahead_params.max_readahead_size = 16384;
|
|
|
|
readahead_params.implicit_auto_readahead = true;
|
|
|
|
readahead_params.num_file_reads_for_auto_readahead = 2;
|
|
|
|
readahead_params.num_buffers = 2;
|
|
|
|
|
|
|
|
FilePrefetchBuffer fpb(readahead_params, /*enable=*/true,
|
|
|
|
/*track_min_offset=*/false, fs(), nullptr, nullptr,
|
|
|
|
nullptr, FilePrefetchBufferUsage::kUnknown);
|
2023-09-11 18:41:44 +00:00
|
|
|
|
|
|
|
Slice result;
|
|
|
|
// Simulate a seek of half of alignment bytes at offset n. Due to the
|
|
|
|
// readahead settings, it won't prefetch extra or do any alignment and
|
|
|
|
// offset of buffer will be n.
|
|
|
|
Status s = fpb.PrefetchAsync(IOOptions(), r.get(), n, n, &result);
|
|
|
|
|
|
|
|
// Platforms that don't have IO uring may not support async IO.
|
|
|
|
if (s.IsNotSupported()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_TRUE(s.IsTryAgain());
|
|
|
|
|
|
|
|
IOOptions io_opts;
|
|
|
|
io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_TRUE(fpb.TryReadFromCache(io_opts, r.get(), n, n, &result, &s));
|
2023-09-11 18:41:44 +00:00
|
|
|
|
|
|
|
if (read_async_called) {
|
|
|
|
ASSERT_EQ(fpb.GetPrefetchOffset(), n);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// With readahead enabled, it will do the alignment and prefetch and offset of
|
|
|
|
// buffer will be 0.
|
|
|
|
{
|
|
|
|
read_async_called = false;
|
2024-01-05 17:29:01 +00:00
|
|
|
ReadaheadParams readahead_params;
|
|
|
|
readahead_params.initial_readahead_size = 16384;
|
|
|
|
readahead_params.max_readahead_size = 16384;
|
|
|
|
readahead_params.num_file_reads_for_auto_readahead = 2;
|
|
|
|
readahead_params.num_buffers = 2;
|
|
|
|
FilePrefetchBuffer fpb(readahead_params, /*enable=*/true,
|
|
|
|
/*track_min_offset=*/false, fs(), nullptr, nullptr,
|
|
|
|
nullptr, FilePrefetchBufferUsage::kUnknown);
|
2023-09-11 18:41:44 +00:00
|
|
|
|
|
|
|
Slice result;
|
|
|
|
// Simulate a seek of half of alignment bytes at offset n.
|
|
|
|
Status s = fpb.PrefetchAsync(IOOptions(), r.get(), n, n, &result);
|
|
|
|
|
|
|
|
// Platforms that don't have IO uring may not support async IO.
|
|
|
|
if (s.IsNotSupported()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_TRUE(s.IsTryAgain());
|
|
|
|
|
|
|
|
IOOptions io_opts;
|
|
|
|
io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_TRUE(fpb.TryReadFromCache(io_opts, r.get(), n, n, &result, &s));
|
2023-09-11 18:41:44 +00:00
|
|
|
|
|
|
|
if (read_async_called) {
|
|
|
|
ASSERT_EQ(fpb.GetPrefetchOffset(), 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) {
|
|
|
|
std::string fname = "seek-with-block-cache-hit";
|
|
|
|
Random rand(0);
|
|
|
|
std::string content = rand.RandomString(32768);
|
|
|
|
Write(fname, content);
|
|
|
|
|
|
|
|
FileOptions opts;
|
|
|
|
std::unique_ptr<RandomAccessFileReader> r;
|
|
|
|
Read(fname, opts, &r);
|
|
|
|
|
2024-01-05 17:29:01 +00:00
|
|
|
ReadaheadParams readahead_params;
|
|
|
|
readahead_params.initial_readahead_size = 8192;
|
|
|
|
readahead_params.max_readahead_size = 16384;
|
|
|
|
readahead_params.num_buffers = 2;
|
|
|
|
FilePrefetchBuffer fpb(readahead_params, /*enable=*/true,
|
|
|
|
/*track_min_offset=*/false, fs(), nullptr, nullptr,
|
|
|
|
nullptr, FilePrefetchBufferUsage::kUnknown);
|
2023-01-20 18:17:57 +00:00
|
|
|
|
|
|
|
int read_async_called = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"FilePrefetchBuffer::ReadAsync",
|
|
|
|
[&](void* /*arg*/) { read_async_called++; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Slice async_result;
|
|
|
|
// Simulate a seek of 4000 bytes at offset 3000. Due to the readahead
|
|
|
|
// settings, it will do two reads of 4000+4096 and 4096
|
|
|
|
Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 3000, 4000, &async_result);
|
2023-02-17 02:33:06 +00:00
|
|
|
|
2023-01-20 18:17:57 +00:00
|
|
|
// Platforms that don't have IO uring may not support async IO
|
2023-02-17 02:33:06 +00:00
|
|
|
if (s.IsNotSupported()) {
|
|
|
|
return;
|
|
|
|
}
|
2023-01-20 18:17:57 +00:00
|
|
|
|
2023-02-17 02:33:06 +00:00
|
|
|
ASSERT_TRUE(s.IsTryAgain());
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
IOOptions io_opts;
|
|
|
|
io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
|
2024-01-05 17:29:01 +00:00
|
|
|
ASSERT_TRUE(fpb.TryReadFromCache(io_opts, r.get(), /*offset=*/3000,
|
|
|
|
/*length=*/4000, &async_result, &s));
|
2023-01-20 18:17:57 +00:00
|
|
|
// No sync call should be made.
|
|
|
|
HistogramData sst_read_micros;
|
|
|
|
stats()->histogramData(SST_READ_MICROS, &sst_read_micros);
|
|
|
|
ASSERT_EQ(sst_read_micros.count, 0);
|
|
|
|
|
|
|
|
// Number of async calls should be.
|
|
|
|
ASSERT_EQ(read_async_called, 2);
|
|
|
|
// Length should be 4000.
|
|
|
|
ASSERT_EQ(async_result.size(), 4000);
|
|
|
|
// Data correctness.
|
2023-09-20 23:13:20 +00:00
|
|
|
Slice result(&content[3000], 4000);
|
|
|
|
ASSERT_EQ(result.size(), 4000);
|
|
|
|
ASSERT_EQ(result, async_result);
|
|
|
|
}
|
|
|
|
|
2023-10-23 21:42:44 +00:00
|
|
|
TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) {
  std::string file_name = "seek-with-block-cache-hit";
  Random rnd(0);
  std::string payload = rnd.RandomString(32768);
  Write(file_name, payload);

  FileOptions file_opts;
  std::unique_ptr<RandomAccessFileReader> reader;
  Read(file_name, file_opts, &reader);

  std::shared_ptr<Statistics> stats = CreateDBStatistics();
  ReadaheadParams readahead_params;
  readahead_params.initial_readahead_size = 8192;
  readahead_params.max_readahead_size = 8192;
  FilePrefetchBuffer fpb(readahead_params, true, false, fs(), nullptr,
                         stats.get());
  Slice result;
  // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings,
  // it will do a read of offset 0 and length - (4096 + 8192) 12288.
  Status st;
  ASSERT_TRUE(
      fpb.TryReadFromCache(IOOptions(), reader.get(), 0, 4096, &result, &st));
  ASSERT_EQ(st, Status::OK());
  ASSERT_EQ(stats->getTickerCount(PREFETCH_HITS), 0);
  ASSERT_EQ(stats->getTickerCount(PREFETCH_BYTES_USEFUL), 0);

  // Simulate a block cache hit.
  fpb.UpdateReadPattern(4096, 4096, false);
  // Now read some data that'll prefetch additional data from 12288 to 24576.
  // (8192) + 8192 (readahead_size).
  ASSERT_TRUE(
      fpb.TryReadFromCache(IOOptions(), reader.get(), 8192, 8192, &result, &st));
  ASSERT_EQ(st, Status::OK());
  ASSERT_EQ(stats->getTickerCount(PREFETCH_HITS), 0);
  ASSERT_EQ(stats->getTickerCount(PREFETCH_BYTES_USEFUL), 4096);

  ASSERT_TRUE(
      fpb.TryReadFromCache(IOOptions(), reader.get(), 12288, 4096, &result, &st));
  ASSERT_EQ(st, Status::OK());
  ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_HITS), 1);
  ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_BYTES_USEFUL), 8192);

  // Now read some data whose length doesn't align with alignment and needs
  // prefetching. Read from 16000 with length 10000 (i.e. requested end offset
  // - 26000).
  ASSERT_TRUE(
      fpb.TryReadFromCache(IOOptions(), reader.get(), 16000, 10000, &result, &st));
  ASSERT_EQ(st, Status::OK());
  ASSERT_EQ(stats->getAndResetTickerCount(PREFETCH_HITS), 0);
  ASSERT_EQ(
      stats->getAndResetTickerCount(PREFETCH_BYTES_USEFUL),
      /* 24576(end offset of the buffer) - 16000(requested offset) =*/8576);
}
|
|
|
|
|
2020-08-28 01:15:11 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2022-10-18 07:35:35 +00:00
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
2020-08-28 01:15:11 +00:00
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
|
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|