2017-02-27 20:19:23 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2017-02-27 20:19:23 +00:00
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
2020-03-24 01:47:34 +00:00
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
#include <cstring>
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
#include "db/db_test_util.h"
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
#include "options/options_helper.h"
|
2017-02-27 20:19:23 +00:00
|
|
|
#include "port/stack_trace.h"
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
#include "rocksdb/filter_policy.h"
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
#include "rocksdb/flush_block_policy.h"
|
2020-06-08 23:08:31 +00:00
|
|
|
#include "rocksdb/merge_operator.h"
|
2017-02-27 20:19:23 +00:00
|
|
|
#include "rocksdb/perf_context.h"
|
2021-12-10 16:12:09 +00:00
|
|
|
#include "rocksdb/table.h"
|
2019-07-08 05:40:52 +00:00
|
|
|
#include "rocksdb/utilities/debug.h"
|
2019-11-12 00:57:49 +00:00
|
|
|
#include "table/block_based/block_based_table_reader.h"
|
2019-07-01 03:52:34 +00:00
|
|
|
#include "table/block_based/block_builder.h"
|
2019-05-30 18:21:38 +00:00
|
|
|
#include "test_util/sync_point.h"
|
2021-01-04 19:50:29 +00:00
|
|
|
#include "util/file_checksum_helper.h"
|
2020-07-09 21:33:42 +00:00
|
|
|
#include "util/random.h"
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
#include "utilities/counted_fs.h"
|
2020-07-09 21:33:42 +00:00
|
|
|
#include "utilities/fault_injection_env.h"
|
2020-06-08 23:08:31 +00:00
|
|
|
#include "utilities/merge_operators.h"
|
|
|
|
#include "utilities/merge_operators/string_append/stringappend.h"
|
2017-02-27 20:19:23 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2017-02-27 20:19:23 +00:00
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
static bool enable_io_uring = true;
|
|
|
|
extern "C" bool RocksDbIOUringEnable() { return enable_io_uring; }
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
class DBBasicTest : public DBTestBase {
|
|
|
|
public:
|
2021-07-23 15:37:27 +00:00
|
|
|
DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {}
|
2017-02-27 20:19:23 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, OpenWhenOpen) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
2020-09-29 16:47:33 +00:00
|
|
|
DB* db2 = nullptr;
|
|
|
|
Status s = DB::Open(options, dbname_, &db2);
|
Clean up after two test failures in db_basic_test (#7682)
Summary:
In db_basic_test.cc, there are two tests that rely on the underlying
system's `LockFile` support to function correctly:
DBBasicTest.OpenWhenOpen and DBBasicTest.CheckLock. In both tests,
re-opening a db using `DB::Open` is expected to fail because the second
open cannot lock the LOCK file. Some distributed file systems, e.g. HDFS
do not support the POSIX-style file lock. Therefore, these unit tests will cause
assertion failure and the second `Open` will create a db instance.
Currently, these db instances are not closed after the assertion
failure. Since these db instances are registered with some process-wide, static
data structures, e.g. `PeriodicWorkScheduler::Default()`, they can still be
accessed after the unit tests. However, the `Env` object created for this db
instance is destroyed when the test finishes in `~DBTestBase()`. Consequently,
it causes illegal memory access.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7682
Test Plan:
Run the following on a distrubited file system:
```
make check
```
Reviewed By: anand1976
Differential Revision: D25004215
Pulled By: riversand963
fbshipit-source-id: f4327d7716c0e72b13bb43737ec9a5d156da4d52
2020-11-17 06:07:29 +00:00
|
|
|
ASSERT_NOK(s) << [db2]() {
|
|
|
|
delete db2;
|
|
|
|
return "db2 open: ok";
|
|
|
|
}();
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(Status::Code::kIOError, s.code());
|
|
|
|
ASSERT_EQ(Status::SubCode::kNone, s.subcode());
|
|
|
|
ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
|
|
|
|
|
|
|
|
delete db2;
|
|
|
|
}
|
|
|
|
|
2022-01-28 03:29:17 +00:00
|
|
|
TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) {
|
|
|
|
if (!IsDirectIOSupported()) {
|
|
|
|
ROCKSDB_GTEST_BYPASS("Direct IO not supported");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.use_direct_io_for_flush_and_compaction = true;
|
|
|
|
options.writable_file_max_buffer_size = 0;
|
|
|
|
ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
|
|
|
|
|
|
|
|
options.writable_file_max_buffer_size = 1024;
|
|
|
|
Reopen(options);
|
|
|
|
const std::unordered_map<std::string, std::string> new_db_opts = {
|
|
|
|
{"writable_file_max_buffer_size", "0"}};
|
|
|
|
ASSERT_TRUE(db_->SetDBOptions(new_db_opts).IsInvalidArgument());
|
|
|
|
}
|
|
|
|
|
2020-06-15 17:45:03 +00:00
|
|
|
TEST_F(DBBasicTest, UniqueSession) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
std::string sid1, sid2, sid3, sid4;
|
|
|
|
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid1));
|
2020-06-15 17:45:03 +00:00
|
|
|
Reopen(options);
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid2));
|
2020-06-15 17:45:03 +00:00
|
|
|
ASSERT_OK(Put("foo", "v1"));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid4));
|
2020-06-15 17:45:03 +00:00
|
|
|
Reopen(options);
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid3));
|
2020-06-15 17:45:03 +00:00
|
|
|
|
|
|
|
ASSERT_NE(sid1, sid2);
|
|
|
|
ASSERT_NE(sid1, sid3);
|
|
|
|
ASSERT_NE(sid2, sid3);
|
|
|
|
|
|
|
|
ASSERT_EQ(sid2, sid4);
|
|
|
|
|
Restore file size in backup table file names (and other cleanup) (#7400)
Summary:
Prior to 6.12, backup files using share_files_with_checksum had
the file size encoded in the file name, after the last '\_' and before
the last '.'. We considered this an implementation detail subject to
change, and indeed removed this information from the file name (with an
option to use old behavior) because it was considered
ineffective/inefficient for file name uniqueness. However, some
downstream RocksDB users were relying on this information since the file
size is not explicitly in the backup manifest file.
This primary purpose of this change is "retrofitting" the 6.12 release
(not yet a public release) to simultaneously support the benefits of the
new naming scheme (I/O performance and data correctness at scale) and
preserve the file size information, both as default behaviors. With this
change, we are essentially making the file size information encoded in
the file name an official, though obscure, extension of the backup meta
file format.
We preserve an option (kLegacyCrc32cAndFileSize) to use the original
"legacy" naming scheme, with its caveats, and make it easy to omit the
file size information (no kFlagIncludeFileSize), for more compact file
names. But note that changing the naming scheme used on an existing db
and backup directory can lead to transient space amplification, as some
files will be stored under two names in the shared_checksum directory.
Because some backups were saved using the original 6.12 naming scheme,
we offer two ways of dealing with those files: SST files generated by
older 6.12 versions can either use the default naming scheme in effect
when the SST files were generated (kFlagMatchInterimNaming, default, no
transient space amplification) or can use a new naming scheme (no
kFlagMatchInterimNaming, potential space amplification because some
already stored files getting a new name).
We don't have a natural way to detect which files were generated by
previous 6.12 versions, but this change hacks one in by changing DB
session ids to now use a more concise encoding, reducing file name
length, saving ~dozen bytes from SST files, and making them visually
distinct from DB ids so that they are less likely to be mixed up.
Two final auxiliary notes:
Recognizing that the backup file names have become a de facto part of
the backup meta schema, this change makes them easier to parse and
extend by putting a distinct marker, 's', before DB session ids embedded
in the name. When we extend this to allow custom checksums in the name,
they can get their own marker to ensure safe parsing. For backward
compatibility, file size does not get a marker but is assumed for
`_[0-9]+[.]`
Another change from initial 6.12 default behavior is never including
file custom checksum in the file name. Looking ahead to 6.13, we do not
want the default behavior to cause backup space amplification for
someone turning on file custom checksum checking in BackupEngine; we
want that to be an easy decision. When implemented, including file
custom checksums in backup file names will be a non-default option.
Actual file name patterns and priorities, as regexes:
kLegacyCrc32cAndFileSize OR pre-6.12 SST file ->
[0-9]+_[0-9]+_[0-9]+[.]sst
kFlagMatchInterimNaming set (default) AND early 6.12 SST file ->
[0-9]+_[0-9a-fA-F-]+[.]sst
kUseDbSessionId AND NOT kFlagIncludeFileSize ->
[0-9]+_s[0-9A-Z]{20}[.]sst
kUseDbSessionId AND kFlagIncludeFileSize (default) ->
[0-9]+_s[0-9A-Z]{20}_[0-9]+[.]sst
We might add opt-in options for more '\_' separated data in the name,
but embedded file size, if present, will always be after last '\_' and
before '.sst'.
This change was originally applied to version 6.12. (See https://github.com/facebook/rocksdb/issues/7390)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7400
Test Plan:
unit tests included. Sync point callbacks are used to mimic
previous version SST files.
Reviewed By: ajkr
Differential Revision: D23759587
Pulled By: pdillinger
fbshipit-source-id: f62d8af4e0978de0a34f26288cfbe66049b70025
2020-09-17 17:22:56 +00:00
|
|
|
// Expected compact format for session ids (see notes in implementation)
|
2021-09-07 20:04:07 +00:00
|
|
|
TestRegex expected("[0-9A-Z]{20}");
|
|
|
|
EXPECT_MATCHES_REGEX(sid1, expected);
|
|
|
|
EXPECT_MATCHES_REGEX(sid2, expected);
|
|
|
|
EXPECT_MATCHES_REGEX(sid3, expected);
|
Restore file size in backup table file names (and other cleanup) (#7400)
Summary:
Prior to 6.12, backup files using share_files_with_checksum had
the file size encoded in the file name, after the last '\_' and before
the last '.'. We considered this an implementation detail subject to
change, and indeed removed this information from the file name (with an
option to use old behavior) because it was considered
ineffective/inefficient for file name uniqueness. However, some
downstream RocksDB users were relying on this information since the file
size is not explicitly in the backup manifest file.
This primary purpose of this change is "retrofitting" the 6.12 release
(not yet a public release) to simultaneously support the benefits of the
new naming scheme (I/O performance and data correctness at scale) and
preserve the file size information, both as default behaviors. With this
change, we are essentially making the file size information encoded in
the file name an official, though obscure, extension of the backup meta
file format.
We preserve an option (kLegacyCrc32cAndFileSize) to use the original
"legacy" naming scheme, with its caveats, and make it easy to omit the
file size information (no kFlagIncludeFileSize), for more compact file
names. But note that changing the naming scheme used on an existing db
and backup directory can lead to transient space amplification, as some
files will be stored under two names in the shared_checksum directory.
Because some backups were saved using the original 6.12 naming scheme,
we offer two ways of dealing with those files: SST files generated by
older 6.12 versions can either use the default naming scheme in effect
when the SST files were generated (kFlagMatchInterimNaming, default, no
transient space amplification) or can use a new naming scheme (no
kFlagMatchInterimNaming, potential space amplification because some
already stored files getting a new name).
We don't have a natural way to detect which files were generated by
previous 6.12 versions, but this change hacks one in by changing DB
session ids to now use a more concise encoding, reducing file name
length, saving ~dozen bytes from SST files, and making them visually
distinct from DB ids so that they are less likely to be mixed up.
Two final auxiliary notes:
Recognizing that the backup file names have become a de facto part of
the backup meta schema, this change makes them easier to parse and
extend by putting a distinct marker, 's', before DB session ids embedded
in the name. When we extend this to allow custom checksums in the name,
they can get their own marker to ensure safe parsing. For backward
compatibility, file size does not get a marker but is assumed for
`_[0-9]+[.]`
Another change from initial 6.12 default behavior is never including
file custom checksum in the file name. Looking ahead to 6.13, we do not
want the default behavior to cause backup space amplification for
someone turning on file custom checksum checking in BackupEngine; we
want that to be an easy decision. When implemented, including file
custom checksums in backup file names will be a non-default option.
Actual file name patterns and priorities, as regexes:
kLegacyCrc32cAndFileSize OR pre-6.12 SST file ->
[0-9]+_[0-9]+_[0-9]+[.]sst
kFlagMatchInterimNaming set (default) AND early 6.12 SST file ->
[0-9]+_[0-9a-fA-F-]+[.]sst
kUseDbSessionId AND NOT kFlagIncludeFileSize ->
[0-9]+_s[0-9A-Z]{20}[.]sst
kUseDbSessionId AND kFlagIncludeFileSize (default) ->
[0-9]+_s[0-9A-Z]{20}_[0-9]+[.]sst
We might add opt-in options for more '\_' separated data in the name,
but embedded file size, if present, will always be after last '\_' and
before '.sst'.
This change was originally applied to version 6.12. (See https://github.com/facebook/rocksdb/issues/7390)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7400
Test Plan:
unit tests included. Sync point callbacks are used to mimic
previous version SST files.
Reviewed By: ajkr
Differential Revision: D23759587
Pulled By: pdillinger
fbshipit-source-id: f62d8af4e0978de0a34f26288cfbe66049b70025
2020-09-17 17:22:56 +00:00
|
|
|
|
2020-06-15 17:45:03 +00:00
|
|
|
Close();
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid1));
|
2020-06-15 17:45:03 +00:00
|
|
|
// Test uniqueness between readonly open (sid1) and regular open (sid3)
|
|
|
|
ASSERT_NE(sid1, sid3);
|
|
|
|
Close();
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid2));
|
2020-06-15 17:45:03 +00:00
|
|
|
ASSERT_EQ("v1", Get("foo"));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid3));
|
2020-06-15 17:45:03 +00:00
|
|
|
|
|
|
|
ASSERT_NE(sid1, sid2);
|
|
|
|
|
|
|
|
ASSERT_EQ(sid2, sid3);
|
|
|
|
|
|
|
|
CreateAndReopenWithCF({"goku"}, options);
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid1));
|
2020-06-15 17:45:03 +00:00
|
|
|
ASSERT_OK(Put("bar", "e1"));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid2));
|
2020-06-15 17:45:03 +00:00
|
|
|
ASSERT_EQ("e1", Get("bar"));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid3));
|
2020-06-15 17:45:03 +00:00
|
|
|
ReopenWithColumnFamilies({"default", "goku"}, options);
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbSessionId(sid4));
|
2020-06-15 17:45:03 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(sid1, sid2);
|
|
|
|
ASSERT_EQ(sid2, sid3);
|
|
|
|
|
|
|
|
ASSERT_NE(sid1, sid4);
|
|
|
|
}
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
TEST_F(DBBasicTest, ReadOnlyDB) {
|
|
|
|
ASSERT_OK(Put("foo", "v1"));
|
|
|
|
ASSERT_OK(Put("bar", "v2"));
|
|
|
|
ASSERT_OK(Put("foo", "v3"));
|
|
|
|
Close();
|
|
|
|
|
2020-08-03 22:21:56 +00:00
|
|
|
auto verify_one_iter = [&](Iterator* iter) {
|
|
|
|
int count = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
++count;
|
|
|
|
}
|
|
|
|
// Always expect two keys: "foo" and "bar"
|
|
|
|
ASSERT_EQ(count, 2);
|
|
|
|
};
|
|
|
|
|
|
|
|
auto verify_all_iters = [&]() {
|
|
|
|
Iterator* iter = db_->NewIterator(ReadOptions());
|
|
|
|
verify_one_iter(iter);
|
|
|
|
delete iter;
|
|
|
|
|
|
|
|
std::vector<Iterator*> iters;
|
|
|
|
ASSERT_OK(db_->NewIterators(ReadOptions(),
|
|
|
|
{dbfull()->DefaultColumnFamily()}, &iters));
|
|
|
|
ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
|
|
|
|
verify_one_iter(iters[0]);
|
|
|
|
delete iters[0];
|
|
|
|
};
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
auto options = CurrentOptions();
|
2017-10-19 17:48:47 +00:00
|
|
|
assert(options.env == env_);
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
ASSERT_EQ("v3", Get("foo"));
|
|
|
|
ASSERT_EQ("v2", Get("bar"));
|
2020-08-03 22:21:56 +00:00
|
|
|
verify_all_iters();
|
2017-02-27 20:19:23 +00:00
|
|
|
Close();
|
|
|
|
|
|
|
|
// Reopen and flush memtable.
|
|
|
|
Reopen(options);
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2017-02-27 20:19:23 +00:00
|
|
|
Close();
|
|
|
|
// Now check keys in read only mode.
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
ASSERT_EQ("v3", Get("foo"));
|
|
|
|
ASSERT_EQ("v2", Get("bar"));
|
2020-08-03 22:21:56 +00:00
|
|
|
verify_all_iters();
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
|
|
|
|
}
|
|
|
|
|
Remove corrupted WAL files in kPointRecoveryMode with avoid_flush_duing_recovery set true (#9634)
Summary:
1) In case of non-TransactionDB and avoid_flush_during_recovery = true, RocksDB won't
flush the data from WAL to L0 for all column families if possible. As a
result, not all column families can increase their log_numbers, and
min_log_number_to_keep won't change.
2) For transaction DB (.allow_2pc), even with the flush, there may be old WAL files that it must not delete because they can contain data of uncommitted transactions and min_log_number_to_keep won't change.
If we persist a new MANIFEST with
advanced log_numbers for some column families, then during a second
crash after persisting the MANIFEST, RocksDB will see some column
families' log_numbers larger than the corrupted wal, and the "column family inconsistency" error will be hit, causing recovery to fail.
As a solution,
1. the corrupted WALs whose numbers are larger than the
corrupted wal and smaller than the new WAL will be moved to archive folder.
2. Currently, RocksDB DB::Open() may creates and writes to two new MANIFEST files even before recovery succeeds. This PR buffers the edits in a structure and writes to a new MANIFEST after recovery is successful
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9634
Test Plan:
1. Added new unit tests
2. make crast_test -j
Reviewed By: riversand963
Differential Revision: D34463666
Pulled By: akankshamahajan15
fbshipit-source-id: e233d3af0ed4e2028ca0cf051e5a334a0fdc9d19
2022-04-11 22:39:31 +00:00
|
|
|
// TODO akanksha: Update the test to check that combination
|
|
|
|
// does not actually write to FS (use open read-only with
|
|
|
|
// CompositeEnvWrapper+ReadOnlyFileSystem).
|
|
|
|
TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
|
2019-09-03 15:50:47 +00:00
|
|
|
ASSERT_OK(Put("foo", "v1"));
|
|
|
|
ASSERT_OK(Put("bar", "v2"));
|
|
|
|
ASSERT_OK(Put("foo", "v3"));
|
|
|
|
Close();
|
|
|
|
|
|
|
|
auto options = CurrentOptions();
|
|
|
|
options.write_dbid_to_manifest = true;
|
|
|
|
assert(options.env == env_);
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
std::string db_id1;
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbIdentity(db_id1));
|
2019-09-03 15:50:47 +00:00
|
|
|
ASSERT_EQ("v3", Get("foo"));
|
|
|
|
ASSERT_EQ("v2", Get("bar"));
|
|
|
|
Iterator* iter = db_->NewIterator(ReadOptions());
|
|
|
|
int count = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
++count;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(count, 2);
|
|
|
|
delete iter;
|
|
|
|
Close();
|
|
|
|
|
|
|
|
// Reopen and flush memtable.
|
|
|
|
Reopen(options);
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2019-09-03 15:50:47 +00:00
|
|
|
Close();
|
|
|
|
// Now check keys in read only mode.
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
ASSERT_EQ("v3", Get("foo"));
|
|
|
|
ASSERT_EQ("v2", Get("bar"));
|
|
|
|
ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
|
|
|
|
std::string db_id2;
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->GetDbIdentity(db_id2));
|
2019-09-03 15:50:47 +00:00
|
|
|
ASSERT_EQ(db_id1, db_id2);
|
|
|
|
}
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
TEST_F(DBBasicTest, CompactedDB) {
|
|
|
|
const uint64_t kFileSize = 1 << 20;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.write_buffer_size = kFileSize;
|
|
|
|
options.target_file_size_base = kFileSize;
|
|
|
|
options.max_bytes_for_level_base = 1 << 30;
|
|
|
|
options.compression = kNoCompression;
|
|
|
|
Reopen(options);
|
|
|
|
// 1 L0 file, use CompactedDB if max_open_files = -1
|
|
|
|
ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2017-02-27 20:19:23 +00:00
|
|
|
Close();
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
Status s = Put("new", "value");
|
|
|
|
ASSERT_EQ(s.ToString(),
|
|
|
|
"Not implemented: Not supported operation in read only mode.");
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
|
|
|
|
Close();
|
|
|
|
options.max_open_files = -1;
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
s = Put("new", "value");
|
|
|
|
ASSERT_EQ(s.ToString(),
|
|
|
|
"Not implemented: Not supported in compacted db mode.");
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
|
|
|
|
Close();
|
|
|
|
Reopen(options);
|
|
|
|
// Add more L0 files
|
|
|
|
ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
|
|
|
|
ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2022-08-29 20:36:23 +00:00
|
|
|
ASSERT_OK(Put("something_not_flushed", "x"));
|
2017-02-27 20:19:23 +00:00
|
|
|
Close();
|
|
|
|
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
// Fallback to read-only DB
|
|
|
|
s = Put("new", "value");
|
|
|
|
ASSERT_EQ(s.ToString(),
|
|
|
|
"Not implemented: Not supported operation in read only mode.");
|
2022-08-29 20:36:23 +00:00
|
|
|
|
|
|
|
// TODO: validate that other write ops return NotImplemented
|
|
|
|
// (DBImplReadOnly is missing some overrides)
|
|
|
|
|
|
|
|
// Ensure no deadlock on flush triggered by another API function
|
|
|
|
// (Old deadlock bug depends on something_not_flushed above.)
|
|
|
|
std::vector<std::string> files;
|
|
|
|
uint64_t manifest_file_size;
|
|
|
|
ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
|
|
|
|
LiveFilesStorageInfoOptions lfsi_opts;
|
|
|
|
lfsi_opts.wal_size_for_flush = 0; // always
|
|
|
|
std::vector<LiveFileStorageInfo> files2;
|
|
|
|
ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
Close();
|
|
|
|
|
|
|
|
// Full compaction
|
|
|
|
Reopen(options);
|
|
|
|
// Add more keys
|
|
|
|
ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
|
|
|
|
ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
|
|
|
|
ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
|
|
|
|
ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(3, NumTableFilesAtLevel(1));
|
|
|
|
Close();
|
|
|
|
|
|
|
|
// CompactedDB
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
s = Put("new", "value");
|
|
|
|
ASSERT_EQ(s.ToString(),
|
|
|
|
"Not implemented: Not supported in compacted db mode.");
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("abc"));
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("ccc"));
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("ggg"));
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("kkk"));
|
|
|
|
|
2022-08-29 20:36:23 +00:00
|
|
|
// TODO: validate that other write ops return NotImplemented
|
|
|
|
// (CompactedDB is missing some overrides)
|
|
|
|
|
|
|
|
// Ensure no deadlock on flush triggered by another API function
|
|
|
|
ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
|
|
|
|
ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
// MultiGet
|
|
|
|
std::vector<std::string> values;
|
|
|
|
std::vector<Status> status_list = dbfull()->MultiGet(
|
|
|
|
ReadOptions(),
|
|
|
|
std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
|
|
|
|
Slice("ggg"), Slice("iii"), Slice("kkk")}),
|
|
|
|
&values);
|
|
|
|
ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
|
|
|
|
ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
|
|
|
|
ASSERT_OK(status_list[0]);
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
|
|
|
|
ASSERT_TRUE(status_list[1].IsNotFound());
|
|
|
|
ASSERT_OK(status_list[2]);
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
|
|
|
|
ASSERT_TRUE(status_list[3].IsNotFound());
|
|
|
|
ASSERT_OK(status_list[4]);
|
|
|
|
ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
|
|
|
|
ASSERT_TRUE(status_list[5].IsNotFound());
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
// Add a key
|
|
|
|
ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
|
|
|
|
Close();
|
|
|
|
ASSERT_OK(ReadOnlyReopen(options));
|
|
|
|
s = Put("new", "value");
|
|
|
|
ASSERT_EQ(s.ToString(),
|
|
|
|
"Not implemented: Not supported operation in read only mode.");
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, LevelLimitReopen) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
const std::string value(1024 * 1024, ' ');
|
|
|
|
int i = 0;
|
|
|
|
while (NumTableFilesAtLevel(2, 1) == 0) {
|
|
|
|
ASSERT_OK(Put(1, Key(i++), value));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
|
|
|
|
ASSERT_OK(dbfull()->TEST_WaitForCompact());
|
2017-02-27 20:19:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
options.num_levels = 1;
|
|
|
|
options.max_bytes_for_level_multiplier_additional.resize(1, 1);
|
|
|
|
Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
ASSERT_EQ(s.IsInvalidArgument(), true);
|
|
|
|
ASSERT_EQ(s.ToString(),
|
|
|
|
"Invalid argument: db has more levels than options.num_levels");
|
|
|
|
|
|
|
|
options.num_levels = 10;
|
|
|
|
options.max_bytes_for_level_multiplier_additional.resize(10, 1);
|
|
|
|
ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, PutDeleteGet) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "foo", "v1"));
|
|
|
|
ASSERT_EQ("v1", Get(1, "foo"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "v2"));
|
|
|
|
ASSERT_EQ("v2", Get(1, "foo"));
|
|
|
|
ASSERT_OK(Delete(1, "foo"));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
|
|
|
|
} while (ChangeOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, PutSingleDeleteGet) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "foo", "v1"));
|
|
|
|
ASSERT_EQ("v1", Get(1, "foo"));
|
|
|
|
ASSERT_OK(Put(1, "foo2", "v2"));
|
|
|
|
ASSERT_EQ("v2", Get(1, "foo2"));
|
|
|
|
ASSERT_OK(SingleDelete(1, "foo"));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
|
2019-02-08 00:06:40 +00:00
|
|
|
// Ski FIFO and universal compaction because they do not apply to the test
|
|
|
|
// case. Skip MergePut because single delete does not get removed when it
|
|
|
|
// encounters a merge.
|
|
|
|
} while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
|
|
|
|
kSkipMergePut));
|
2017-02-27 20:19:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, EmptyFlush) {
|
|
|
|
// It is possible to produce empty flushes when using single deletes. Tests
|
|
|
|
// whether empty flushes cause issues.
|
|
|
|
do {
|
|
|
|
Random rnd(301);
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "a", Slice()));
|
|
|
|
ASSERT_OK(SingleDelete(1, "a"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
|
|
|
|
ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
|
2019-02-08 00:06:40 +00:00
|
|
|
// Skip FIFO and universal compaction as they do not apply to the test
|
|
|
|
// case. Skip MergePut because merges cannot be combined with single
|
|
|
|
// deletions.
|
|
|
|
} while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
|
|
|
|
kSkipMergePut));
|
2017-02-27 20:19:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, GetFromVersions) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_OK(Put(1, "foo", "v1"));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ("v1", Get(1, "foo"));
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
|
|
|
|
} while (ChangeOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, GetSnapshot) {
|
|
|
|
anon::OptionsOverride options_override;
|
|
|
|
options_override.skip_policy = kSkipNoSnapshot;
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
|
|
|
|
// Try with both a short key and a long key
|
|
|
|
for (int i = 0; i < 2; i++) {
|
|
|
|
std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
|
|
|
|
ASSERT_OK(Put(1, key, "v1"));
|
|
|
|
const Snapshot* s1 = db_->GetSnapshot();
|
|
|
|
ASSERT_OK(Put(1, key, "v2"));
|
|
|
|
ASSERT_EQ("v2", Get(1, key));
|
|
|
|
ASSERT_EQ("v1", Get(1, key, s1));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ("v2", Get(1, key));
|
|
|
|
ASSERT_EQ("v1", Get(1, key, s1));
|
|
|
|
db_->ReleaseSnapshot(s1);
|
|
|
|
}
|
|
|
|
} while (ChangeOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, CheckLock) {
|
|
|
|
do {
|
Clean up after two test failures in db_basic_test (#7682)
Summary:
In db_basic_test.cc, there are two tests that rely on the underlying
system's `LockFile` support to function correctly:
DBBasicTest.OpenWhenOpen and DBBasicTest.CheckLock. In both tests,
re-opening a db using `DB::Open` is expected to fail because the second
open cannot lock the LOCK file. Some distributed file systems, e.g. HDFS
do not support the POSIX-style file lock. Therefore, these unit tests will cause
assertion failure and the second `Open` will create a db instance.
Currently, these db instances are not closed after the assertion
failure. Since these db instances are registered with some process-wide, static
data structures, e.g. `PeriodicWorkScheduler::Default()`, they can still be
accessed after the unit tests. However, the `Env` object created for this db
instance is destroyed when the test finishes in `~DBTestBase()`. Consequently,
it causes illegal memory access.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7682
Test Plan:
Run the following on a distrubited file system:
```
make check
```
Reviewed By: anand1976
Differential Revision: D25004215
Pulled By: riversand963
fbshipit-source-id: f4327d7716c0e72b13bb43737ec9a5d156da4d52
2020-11-17 06:07:29 +00:00
|
|
|
DB* localdb = nullptr;
|
2017-02-27 20:19:23 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
ASSERT_OK(TryReopen(options));
|
|
|
|
|
|
|
|
// second open should fail
|
2020-03-11 23:20:13 +00:00
|
|
|
Status s = DB::Open(options, dbname_, &localdb);
|
Clean up after two test failures in db_basic_test (#7682)
Summary:
In db_basic_test.cc, there are two tests that rely on the underlying
system's `LockFile` support to function correctly:
DBBasicTest.OpenWhenOpen and DBBasicTest.CheckLock. In both tests,
re-opening a db using `DB::Open` is expected to fail because the second
open cannot lock the LOCK file. Some distributed file systems, e.g. HDFS
do not support the POSIX-style file lock. Therefore, these unit tests will cause
assertion failure and the second `Open` will create a db instance.
Currently, these db instances are not closed after the assertion
failure. Since these db instances are registered with some process-wide, static
data structures, e.g. `PeriodicWorkScheduler::Default()`, they can still be
accessed after the unit tests. However, the `Env` object created for this db
instance is destroyed when the test finishes in `~DBTestBase()`. Consequently,
it causes illegal memory access.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7682
Test Plan:
Run the following on a distrubited file system:
```
make check
```
Reviewed By: anand1976
Differential Revision: D25004215
Pulled By: riversand963
fbshipit-source-id: f4327d7716c0e72b13bb43737ec9a5d156da4d52
2020-11-17 06:07:29 +00:00
|
|
|
ASSERT_NOK(s) << [localdb]() {
|
|
|
|
delete localdb;
|
|
|
|
return "localdb open: ok";
|
|
|
|
}();
|
2020-03-11 23:20:13 +00:00
|
|
|
#ifdef OS_LINUX
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos);
|
2020-03-11 23:20:13 +00:00
|
|
|
#endif // OS_LINUX
|
2017-02-27 20:19:23 +00:00
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, FlushMultipleMemtable) {
|
|
|
|
do {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
WriteOptions writeOpt = WriteOptions();
|
|
|
|
writeOpt.disableWAL = true;
|
|
|
|
options.max_write_buffer_number = 4;
|
|
|
|
options.min_write_buffer_number_to_merge = 3;
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
options.max_write_buffer_size_to_maintain = -1;
|
2017-02-27 20:19:23 +00:00
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
|
|
|
|
|
|
|
|
ASSERT_EQ("v1", Get(1, "foo"));
|
|
|
|
ASSERT_EQ("v1", Get(1, "bar"));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, FlushEmptyColumnFamily) {
|
|
|
|
// Block flush thread and disable compaction thread
|
|
|
|
env_->SetBackgroundThreads(1, Env::HIGH);
|
|
|
|
env_->SetBackgroundThreads(1, Env::LOW);
|
|
|
|
test::SleepingBackgroundTask sleeping_task_low;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
test::SleepingBackgroundTask sleeping_task_high;
|
|
|
|
env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
|
|
|
|
&sleeping_task_high, Env::Priority::HIGH);
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
// disable compaction
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
WriteOptions writeOpt = WriteOptions();
|
|
|
|
writeOpt.disableWAL = true;
|
|
|
|
options.max_write_buffer_number = 2;
|
|
|
|
options.min_write_buffer_number_to_merge = 1;
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
options.max_write_buffer_size_to_maintain =
|
|
|
|
static_cast<int64_t>(options.write_buffer_size);
|
2017-02-27 20:19:23 +00:00
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
// Compaction can still go through even if no thread can flush the
|
|
|
|
// mem table.
|
|
|
|
ASSERT_OK(Flush(0));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
|
|
|
|
// Insert can go through
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
|
|
|
|
|
|
|
|
ASSERT_EQ("v1", Get(0, "foo"));
|
|
|
|
ASSERT_EQ("v1", Get(1, "bar"));
|
|
|
|
|
|
|
|
sleeping_task_high.WakeUp();
|
|
|
|
sleeping_task_high.WaitUntilDone();
|
|
|
|
|
|
|
|
// Flush can still go through.
|
|
|
|
ASSERT_OK(Flush(0));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
|
|
|
|
sleeping_task_low.WakeUp();
|
|
|
|
sleeping_task_low.WaitUntilDone();
|
|
|
|
}
|
|
|
|
|
2020-03-07 00:21:03 +00:00
|
|
|
TEST_F(DBBasicTest, Flush) {
|
2017-02-27 20:19:23 +00:00
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
WriteOptions writeOpt = WriteOptions();
|
|
|
|
writeOpt.disableWAL = true;
|
|
|
|
SetPerfLevel(kEnableTime);
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
|
|
|
|
// this will now also flush the last 2 writes
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
|
|
|
|
|
2017-06-03 00:12:39 +00:00
|
|
|
get_perf_context()->Reset();
|
2017-02-27 20:19:23 +00:00
|
|
|
Get(1, "foo");
|
2017-06-03 00:12:39 +00:00
|
|
|
ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
|
2017-08-18 18:40:36 +00:00
|
|
|
ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes);
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_EQ("v1", Get(1, "foo"));
|
|
|
|
ASSERT_EQ("v1", Get(1, "bar"));
|
|
|
|
|
|
|
|
writeOpt.disableWAL = true;
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
ASSERT_EQ("v2", Get(1, "bar"));
|
2017-06-03 00:12:39 +00:00
|
|
|
get_perf_context()->Reset();
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ("v2", Get(1, "foo"));
|
2017-06-03 00:12:39 +00:00
|
|
|
ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
writeOpt.disableWAL = false;
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
|
|
|
|
ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
|
|
|
|
// 'foo' should be there because its put
|
|
|
|
// has WAL enabled.
|
|
|
|
ASSERT_EQ("v3", Get(1, "foo"));
|
|
|
|
ASSERT_EQ("v3", Get(1, "bar"));
|
|
|
|
|
|
|
|
SetPerfLevel(kDisable);
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, ManifestRollOver) {
|
|
|
|
do {
|
|
|
|
Options options;
|
|
|
|
options.max_manifest_file_size = 10; // 10 bytes
|
|
|
|
options = CurrentOptions(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
{
|
|
|
|
ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
|
|
|
|
ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
|
|
|
|
ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
|
|
|
|
uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
|
|
|
|
ASSERT_OK(Flush(1)); // This should trigger LogAndApply.
|
|
|
|
uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
|
|
|
|
ASSERT_GT(manifest_after_flush, manifest_before_flush);
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
|
|
|
|
// check if a new manifest file got inserted or not.
|
|
|
|
ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
|
|
|
|
ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
|
|
|
|
ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
|
|
|
|
}
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
2022-06-15 22:39:49 +00:00
|
|
|
TEST_F(DBBasicTest, IdentityAcrossRestarts) {
|
|
|
|
constexpr size_t kMinIdSize = 10;
|
2019-09-03 15:50:47 +00:00
|
|
|
do {
|
2022-06-15 22:39:49 +00:00
|
|
|
for (bool with_manifest : {false, true}) {
|
|
|
|
std::string idfilename = IdentityFileName(dbname_);
|
|
|
|
std::string id1, tmp;
|
|
|
|
ASSERT_OK(db_->GetDbIdentity(id1));
|
|
|
|
ASSERT_GE(id1.size(), kMinIdSize);
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.write_dbid_to_manifest = with_manifest;
|
|
|
|
Reopen(options);
|
|
|
|
std::string id2;
|
|
|
|
ASSERT_OK(db_->GetDbIdentity(id2));
|
|
|
|
// id2 should match id1 because identity was not regenerated
|
|
|
|
ASSERT_EQ(id1, id2);
|
|
|
|
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
|
|
|
|
ASSERT_EQ(tmp, id2);
|
|
|
|
|
|
|
|
// Recover from deleted/missing IDENTITY
|
|
|
|
ASSERT_OK(env_->DeleteFile(idfilename));
|
|
|
|
Reopen(options);
|
|
|
|
std::string id3;
|
|
|
|
ASSERT_OK(db_->GetDbIdentity(id3));
|
|
|
|
if (with_manifest) {
|
|
|
|
// id3 should match id1 because identity was restored from manifest
|
|
|
|
ASSERT_EQ(id1, id3);
|
|
|
|
} else {
|
|
|
|
// id3 should NOT match id1 because identity was regenerated
|
|
|
|
ASSERT_NE(id1, id3);
|
|
|
|
ASSERT_GE(id3.size(), kMinIdSize);
|
|
|
|
}
|
|
|
|
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
|
|
|
|
ASSERT_EQ(tmp, id3);
|
|
|
|
|
|
|
|
// Recover from truncated IDENTITY
|
|
|
|
{
|
|
|
|
std::unique_ptr<WritableFile> w;
|
|
|
|
ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
|
|
|
|
ASSERT_OK(w->Close());
|
|
|
|
}
|
|
|
|
Reopen(options);
|
|
|
|
std::string id4;
|
|
|
|
ASSERT_OK(db_->GetDbIdentity(id4));
|
|
|
|
if (with_manifest) {
|
|
|
|
// id4 should match id1 because identity was restored from manifest
|
|
|
|
ASSERT_EQ(id1, id4);
|
|
|
|
} else {
|
|
|
|
// id4 should NOT match id1 because identity was regenerated
|
|
|
|
ASSERT_NE(id1, id4);
|
|
|
|
ASSERT_GE(id4.size(), kMinIdSize);
|
|
|
|
}
|
|
|
|
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
|
|
|
|
ASSERT_EQ(tmp, id4);
|
|
|
|
|
|
|
|
// Recover from overwritten IDENTITY
|
|
|
|
std::string silly_id = "asdf123456789";
|
|
|
|
{
|
|
|
|
std::unique_ptr<WritableFile> w;
|
|
|
|
ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
|
|
|
|
ASSERT_OK(w->Append(silly_id));
|
|
|
|
ASSERT_OK(w->Close());
|
|
|
|
}
|
|
|
|
Reopen(options);
|
|
|
|
std::string id5;
|
|
|
|
ASSERT_OK(db_->GetDbIdentity(id5));
|
|
|
|
if (with_manifest) {
|
|
|
|
// id4 should match id1 because identity was restored from manifest
|
|
|
|
ASSERT_EQ(id1, id5);
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(id5, silly_id);
|
|
|
|
}
|
|
|
|
ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
|
|
|
|
ASSERT_EQ(tmp, id5);
|
2019-09-03 15:50:47 +00:00
|
|
|
}
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
2023-01-24 20:55:03 +00:00
|
|
|
TEST_F(DBBasicTest, LockFileRecovery) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
// Regardless of best_efforts_recovery
|
|
|
|
for (bool ber : {false, true}) {
|
|
|
|
options.best_efforts_recovery = ber;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
std::string id1, id2;
|
|
|
|
ASSERT_OK(db_->GetDbIdentity(id1));
|
|
|
|
Close();
|
|
|
|
|
|
|
|
// Should be OK to re-open DB after lock file deleted
|
|
|
|
std::string lockfilename = LockFileName(dbname_);
|
|
|
|
ASSERT_OK(env_->DeleteFile(lockfilename));
|
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
// Should be same DB as before
|
|
|
|
ASSERT_OK(db_->GetDbIdentity(id2));
|
|
|
|
ASSERT_EQ(id1, id2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
TEST_F(DBBasicTest, Snapshot) {
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
2020-08-11 19:39:49 +00:00
|
|
|
env_->SetMockSleep();
|
2017-02-27 20:19:23 +00:00
|
|
|
anon::OptionsOverride options_override;
|
|
|
|
options_override.skip_policy = kSkipNoSnapshot;
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(0, "foo", "0v1"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "1v1"));
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
const Snapshot* s1 = db_->GetSnapshot();
|
|
|
|
ASSERT_EQ(1U, GetNumSnapshots());
|
|
|
|
uint64_t time_snap1 = GetTimeOldestSnapshots();
|
|
|
|
ASSERT_GT(time_snap1, 0U);
|
2020-01-07 16:35:12 +00:00
|
|
|
ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(0, "foo", "0v2"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "1v2"));
|
2017-02-27 20:19:23 +00:00
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
2020-08-11 19:39:49 +00:00
|
|
|
env_->MockSleepForSeconds(1);
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
const Snapshot* s2 = db_->GetSnapshot();
|
|
|
|
ASSERT_EQ(2U, GetNumSnapshots());
|
|
|
|
ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
|
2020-01-07 16:35:12 +00:00
|
|
|
ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(0, "foo", "0v3"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "1v3"));
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
ManagedSnapshot s3(db_);
|
|
|
|
ASSERT_EQ(3U, GetNumSnapshots());
|
|
|
|
ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
|
2020-01-07 16:35:12 +00:00
|
|
|
ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
|
2017-02-27 20:19:23 +00:00
|
|
|
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(0, "foo", "0v4"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "1v4"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ("0v1", Get(0, "foo", s1));
|
|
|
|
ASSERT_EQ("1v1", Get(1, "foo", s1));
|
|
|
|
ASSERT_EQ("0v2", Get(0, "foo", s2));
|
|
|
|
ASSERT_EQ("1v2", Get(1, "foo", s2));
|
|
|
|
ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
|
|
|
|
ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
|
|
|
|
ASSERT_EQ("0v4", Get(0, "foo"));
|
|
|
|
ASSERT_EQ("1v4", Get(1, "foo"));
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(2U, GetNumSnapshots());
|
|
|
|
ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
|
2020-01-07 16:35:12 +00:00
|
|
|
ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ("0v1", Get(0, "foo", s1));
|
|
|
|
ASSERT_EQ("1v1", Get(1, "foo", s1));
|
|
|
|
ASSERT_EQ("0v2", Get(0, "foo", s2));
|
|
|
|
ASSERT_EQ("1v2", Get(1, "foo", s2));
|
|
|
|
ASSERT_EQ("0v4", Get(0, "foo"));
|
|
|
|
ASSERT_EQ("1v4", Get(1, "foo"));
|
|
|
|
|
|
|
|
db_->ReleaseSnapshot(s1);
|
|
|
|
ASSERT_EQ("0v2", Get(0, "foo", s2));
|
|
|
|
ASSERT_EQ("1v2", Get(1, "foo", s2));
|
|
|
|
ASSERT_EQ("0v4", Get(0, "foo"));
|
|
|
|
ASSERT_EQ("1v4", Get(1, "foo"));
|
|
|
|
ASSERT_EQ(1U, GetNumSnapshots());
|
|
|
|
ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
|
2020-01-07 16:35:12 +00:00
|
|
|
ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
db_->ReleaseSnapshot(s2);
|
|
|
|
ASSERT_EQ(0U, GetNumSnapshots());
|
2020-01-07 16:35:12 +00:00
|
|
|
ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ("0v4", Get(0, "foo"));
|
|
|
|
ASSERT_EQ("1v4", Get(1, "foo"));
|
2019-02-08 00:06:40 +00:00
|
|
|
} while (ChangeOptions());
|
2017-02-27 20:19:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2020-08-25 22:40:49 +00:00
|
|
|
class DBBasicMultiConfigs : public DBBasicTest,
|
|
|
|
public ::testing::WithParamInterface<int> {
|
|
|
|
public:
|
|
|
|
DBBasicMultiConfigs() { option_config_ = GetParam(); }
|
|
|
|
|
|
|
|
static std::vector<int> GenerateOptionConfigs() {
|
|
|
|
std::vector<int> option_configs;
|
|
|
|
for (int option_config = kDefault; option_config < kEnd; ++option_config) {
|
|
|
|
if (!ShouldSkipOptions(option_config, kSkipFIFOCompaction)) {
|
|
|
|
option_configs.push_back(option_config);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return option_configs;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBBasicMultiConfigs, CompactBetweenSnapshots) {
|
2017-02-27 20:19:23 +00:00
|
|
|
anon::OptionsOverride options_override;
|
|
|
|
options_override.skip_policy = kSkipNoSnapshot;
|
2020-08-25 22:40:49 +00:00
|
|
|
Options options = CurrentOptions(options_override);
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
Random rnd(301);
|
|
|
|
FillLevels("a", "z", 1);
|
|
|
|
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "first"));
|
2020-08-25 22:40:49 +00:00
|
|
|
const Snapshot* snapshot1 = db_->GetSnapshot();
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "second"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "third"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "fourth"));
|
2020-08-25 22:40:49 +00:00
|
|
|
const Snapshot* snapshot2 = db_->GetSnapshot();
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "fifth"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "sixth"));
|
2020-08-25 22:40:49 +00:00
|
|
|
|
|
|
|
// All entries (including duplicates) exist
|
|
|
|
// before any compaction or flush is triggered.
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1),
|
|
|
|
"[ sixth, fifth, fourth, third, second, first ]");
|
|
|
|
ASSERT_EQ("sixth", Get(1, "foo"));
|
|
|
|
ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
|
|
|
|
ASSERT_EQ("first", Get(1, "foo", snapshot1));
|
|
|
|
|
|
|
|
// After a flush, "second", "third" and "fifth" should
|
|
|
|
// be removed
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
|
|
|
|
|
|
|
|
// after we release the snapshot1, only two values left
|
|
|
|
db_->ReleaseSnapshot(snapshot1);
|
|
|
|
FillLevels("a", "z", 1);
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
|
|
|
|
nullptr));
|
2020-08-25 22:40:49 +00:00
|
|
|
|
|
|
|
// We have only one valid snapshot snapshot2. Since snapshot1 is
|
|
|
|
// not valid anymore, "first" should be removed by a compaction.
|
|
|
|
ASSERT_EQ("sixth", Get(1, "foo"));
|
|
|
|
ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
|
|
|
|
|
|
|
|
// after we release the snapshot2, only one value should be left
|
|
|
|
db_->ReleaseSnapshot(snapshot2);
|
|
|
|
FillLevels("a", "z", 1);
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
|
|
|
|
nullptr));
|
2020-08-25 22:40:49 +00:00
|
|
|
ASSERT_EQ("sixth", Get(1, "foo"));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
|
2017-02-27 20:19:23 +00:00
|
|
|
}
|
|
|
|
|
2020-08-25 22:40:49 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
|
|
DBBasicMultiConfigs, DBBasicMultiConfigs,
|
|
|
|
::testing::ValuesIn(DBBasicMultiConfigs::GenerateOptionConfigs()));
|
|
|
|
|
2017-02-27 20:19:23 +00:00
|
|
|
TEST_F(DBBasicTest, DBOpen_Options) {
|
|
|
|
Options options = CurrentOptions();
|
2018-01-10 23:36:48 +00:00
|
|
|
Close();
|
|
|
|
Destroy(options);
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
// Does not exist, and create_if_missing == false: error
|
|
|
|
DB* db = nullptr;
|
|
|
|
options.create_if_missing = false;
|
2018-01-10 23:36:48 +00:00
|
|
|
Status s = DB::Open(options, dbname_, &db);
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
|
|
|
|
ASSERT_TRUE(db == nullptr);
|
|
|
|
|
|
|
|
// Does not exist, and create_if_missing == true: OK
|
|
|
|
options.create_if_missing = true;
|
2018-01-10 23:36:48 +00:00
|
|
|
s = DB::Open(options, dbname_, &db);
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db != nullptr);
|
|
|
|
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
|
|
|
|
|
|
|
// Does exist, and error_if_exists == true: error
|
|
|
|
options.create_if_missing = false;
|
|
|
|
options.error_if_exists = true;
|
2018-01-10 23:36:48 +00:00
|
|
|
s = DB::Open(options, dbname_, &db);
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
|
|
|
|
ASSERT_TRUE(db == nullptr);
|
|
|
|
|
|
|
|
// Does exist, and error_if_exists == false: OK
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.error_if_exists = false;
|
2018-01-10 23:36:48 +00:00
|
|
|
s = DB::Open(options, dbname_, &db);
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db != nullptr);
|
|
|
|
|
|
|
|
delete db;
|
|
|
|
db = nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, CompactOnFlush) {
|
|
|
|
anon::OptionsOverride options_override;
|
|
|
|
options_override.skip_policy = kSkipNoSnapshot;
|
|
|
|
do {
|
|
|
|
Options options = CurrentOptions(options_override);
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "v1"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
|
|
|
|
|
|
|
|
// Write two new keys
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "a", "begin"));
|
|
|
|
ASSERT_OK(Put(1, "z", "end"));
|
|
|
|
ASSERT_OK(Flush(1));
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
// Case1: Delete followed by a put
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Delete(1, "foo"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "v2"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
|
|
|
|
|
|
|
|
// After the current memtable is flushed, the DEL should
|
|
|
|
// have been removed
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
|
|
|
|
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
|
|
|
|
nullptr, nullptr));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
|
|
|
|
|
|
|
|
// Case 2: Delete followed by another delete
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Delete(1, "foo"));
|
|
|
|
ASSERT_OK(Delete(1, "foo"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
|
|
|
|
nullptr, nullptr));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
|
|
|
|
|
|
|
|
// Case 3: Put followed by a delete
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "v3"));
|
|
|
|
ASSERT_OK(Delete(1, "foo"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
|
|
|
|
nullptr, nullptr));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
|
|
|
|
|
|
|
|
// Case 4: Put followed by another Put
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "v4"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "v5"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
|
|
|
|
nullptr, nullptr));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
|
|
|
|
|
|
|
|
// clear database
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Delete(1, "foo"));
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
|
|
|
|
nullptr, nullptr));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
|
|
|
|
|
|
|
|
// Case 5: Put followed by snapshot followed by another Put
|
|
|
|
// Both puts should remain.
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "v6"));
|
2017-02-27 20:19:23 +00:00
|
|
|
const Snapshot* snapshot = db_->GetSnapshot();
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "v7"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
|
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
|
|
|
|
// clear database
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Delete(1, "foo"));
|
|
|
|
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
|
|
|
|
nullptr, nullptr));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
|
|
|
|
|
|
|
|
// Case 5: snapshot followed by a put followed by another Put
|
|
|
|
// Only the last put should remain.
|
|
|
|
const Snapshot* snapshot1 = db_->GetSnapshot();
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(1, "foo", "v8"));
|
|
|
|
ASSERT_OK(Put(1, "foo", "v9"));
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
|
|
|
|
db_->ReleaseSnapshot(snapshot1);
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, FlushOneColumnFamily) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
|
|
|
"alyosha", "popovich"},
|
|
|
|
options);
|
|
|
|
|
|
|
|
ASSERT_OK(Put(0, "Default", "Default"));
|
|
|
|
ASSERT_OK(Put(1, "pikachu", "pikachu"));
|
|
|
|
ASSERT_OK(Put(2, "ilya", "ilya"));
|
|
|
|
ASSERT_OK(Put(3, "muromec", "muromec"));
|
|
|
|
ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
|
|
|
|
ASSERT_OK(Put(5, "nikitich", "nikitich"));
|
|
|
|
ASSERT_OK(Put(6, "alyosha", "alyosha"));
|
|
|
|
ASSERT_OK(Put(7, "popovich", "popovich"));
|
|
|
|
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Flush(i));
|
2017-02-27 20:19:23 +00:00
|
|
|
auto tables = ListTableFiles(env_, dbname_);
|
|
|
|
ASSERT_EQ(tables.size(), i + 1U);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, MultiGetSimple) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
2017-08-18 18:40:36 +00:00
|
|
|
SetPerfLevel(kEnableCount);
|
2017-02-27 20:19:23 +00:00
|
|
|
ASSERT_OK(Put(1, "k1", "v1"));
|
|
|
|
ASSERT_OK(Put(1, "k2", "v2"));
|
|
|
|
ASSERT_OK(Put(1, "k3", "v3"));
|
|
|
|
ASSERT_OK(Put(1, "k4", "v4"));
|
|
|
|
ASSERT_OK(Delete(1, "k4"));
|
|
|
|
ASSERT_OK(Put(1, "k5", "v5"));
|
|
|
|
ASSERT_OK(Delete(1, "no_key"));
|
|
|
|
|
|
|
|
std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
|
|
|
|
|
|
|
|
std::vector<std::string> values(20, "Temporary data to be overwritten");
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
|
|
|
|
|
2017-08-18 18:40:36 +00:00
|
|
|
get_perf_context()->Reset();
|
2017-02-27 20:19:23 +00:00
|
|
|
std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
ASSERT_EQ(values[0], "v1");
|
|
|
|
ASSERT_EQ(values[1], "v2");
|
|
|
|
ASSERT_EQ(values[2], "v3");
|
|
|
|
ASSERT_EQ(values[4], "v5");
|
2017-08-18 18:40:36 +00:00
|
|
|
// four kv pairs * two bytes per value
|
|
|
|
ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
ASSERT_OK(s[0]);
|
|
|
|
ASSERT_OK(s[1]);
|
|
|
|
ASSERT_OK(s[2]);
|
|
|
|
ASSERT_TRUE(s[3].IsNotFound());
|
|
|
|
ASSERT_OK(s[4]);
|
|
|
|
ASSERT_TRUE(s[5].IsNotFound());
|
2017-08-18 18:40:36 +00:00
|
|
|
SetPerfLevel(kDisable);
|
2017-02-27 20:19:23 +00:00
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, MultiGetEmpty) {
|
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
// Empty Key Set
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs;
|
|
|
|
std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
|
|
|
|
ASSERT_EQ(s.size(), 0U);
|
|
|
|
|
|
|
|
// Empty Database, Empty Key Set
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
|
|
|
|
ASSERT_EQ(s.size(), 0U);
|
|
|
|
|
|
|
|
// Empty Database, Search for Keys
|
|
|
|
keys.resize(2);
|
|
|
|
keys[0] = "a";
|
|
|
|
keys[1] = "b";
|
|
|
|
cfs.push_back(handles_[0]);
|
|
|
|
cfs.push_back(handles_[1]);
|
|
|
|
s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
|
|
|
|
ASSERT_EQ(static_cast<int>(s.size()), 2);
|
|
|
|
ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
2021-12-10 16:12:09 +00:00
|
|
|
class DBBlockChecksumTest : public DBBasicTest,
|
|
|
|
public testing::WithParamInterface<uint32_t> {};
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest,
|
|
|
|
testing::ValuesIn(test::kFooterFormatVersionsToTest));
|
|
|
|
|
|
|
|
TEST_P(DBBlockChecksumTest, BlockChecksumTest) {
|
2017-02-27 20:19:23 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
2021-12-10 16:12:09 +00:00
|
|
|
table_options.format_version = GetParam();
|
2017-02-27 20:19:23 +00:00
|
|
|
Options options = CurrentOptions();
|
2017-08-24 02:31:40 +00:00
|
|
|
const int kNumPerFile = 2;
|
|
|
|
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
const auto algs = GetSupportedChecksums();
|
|
|
|
const int algs_size = static_cast<int>(algs.size());
|
|
|
|
|
2017-08-24 02:31:40 +00:00
|
|
|
// generate one table with each type of checksum
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
for (int i = 0; i < algs_size; ++i) {
|
|
|
|
table_options.checksum = algs[i];
|
2017-08-24 02:31:40 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
Reopen(options);
|
|
|
|
for (int j = 0; j < kNumPerFile; ++j) {
|
|
|
|
ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
2017-02-27 20:19:23 +00:00
|
|
|
|
2019-10-25 00:14:27 +00:00
|
|
|
// with each valid checksum type setting...
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
for (int i = 0; i < algs_size; ++i) {
|
|
|
|
table_options.checksum = algs[i];
|
2017-08-24 02:31:40 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
Reopen(options);
|
2019-10-25 00:14:27 +00:00
|
|
|
// verify every type of checksum (should be regardless of that setting)
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
for (int j = 0; j < algs_size * kNumPerFile; ++j) {
|
2017-08-24 02:31:40 +00:00
|
|
|
ASSERT_EQ(Key(j), Get(Key(j)));
|
|
|
|
}
|
|
|
|
}
|
Implement XXH3 block checksum type (#9069)
Summary:
XXH3 - latest hash function that is extremely fast on large
data, easily faster than crc32c on most any x86_64 hardware. In
integrating this hash function, I have handled the compression type byte
in a non-standard way to avoid using the streaming API (extra data
movement and active code size because of hash function complexity). This
approach got a thumbs-up from Yann Collet.
Existing functionality change:
* reject bad ChecksumType in options with InvalidArgument
This change split off from https://github.com/facebook/rocksdb/issues/9058 because context-aware checksum is
likely to be handled through different configuration than ChecksumType.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9069
Test Plan:
tests updated, and substantially expanded. Unit tests now check
that we don't accidentally change the values generated by the checksum
algorithms ("schema test") and that we properly handle
invalid/unrecognized checksum types in options or in file footer.
DBTestBase::ChangeOptions (etc.) updated from two to one configuration
changing from default CRC32c ChecksumType. The point of this test code
is to detect possible interactions among features, and the likelihood of
some bad interaction being detected by including configurations other
than XXH3 and CRC32c--and then not detected by stress/crash test--is
extremely low.
Stress/crash test also updated (manual run long enough to see it accepts
new checksum type). db_bench also updated for microbenchmarking
checksums.
### Performance microbenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
./db_bench -benchmarks=crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3,crc32c,xxhash,xxhash64,xxh3
crc32c : 0.200 micros/op 5005220 ops/sec; 19551.6 MB/s (4096 per op)
xxhash : 0.807 micros/op 1238408 ops/sec; 4837.5 MB/s (4096 per op)
xxhash64 : 0.421 micros/op 2376514 ops/sec; 9283.3 MB/s (4096 per op)
xxh3 : 0.171 micros/op 5858391 ops/sec; 22884.3 MB/s (4096 per op)
crc32c : 0.206 micros/op 4859566 ops/sec; 18982.7 MB/s (4096 per op)
xxhash : 0.793 micros/op 1260850 ops/sec; 4925.2 MB/s (4096 per op)
xxhash64 : 0.410 micros/op 2439182 ops/sec; 9528.1 MB/s (4096 per op)
xxh3 : 0.161 micros/op 6202872 ops/sec; 24230.0 MB/s (4096 per op)
crc32c : 0.203 micros/op 4924686 ops/sec; 19237.1 MB/s (4096 per op)
xxhash : 0.839 micros/op 1192388 ops/sec; 4657.8 MB/s (4096 per op)
xxhash64 : 0.424 micros/op 2357391 ops/sec; 9208.6 MB/s (4096 per op)
xxh3 : 0.162 micros/op 6182678 ops/sec; 24151.1 MB/s (4096 per op)
As you can see, especially once warmed up, xxh3 is fastest.
### Performance macrobenchmark (PORTABLE=0 DEBUG_LEVEL=0, Broadwell processor)
Test
for I in `seq 1 50`; do for CHK in 0 1 2 3 4; do TEST_TMPDIR=/dev/shm/rocksdb$CHK ./db_bench -benchmarks=fillseq -memtablerep=vector -allow_concurrent_memtable_write=false -num=30000000 -checksum_type=$CHK 2>&1 | grep 'micros/op' | tee -a results-$CHK & done; wait; done
Results (ops/sec)
for FILE in results*; do echo -n "$FILE "; awk '{ s += $5; c++; } END { print 1.0 * s / c; }' < $FILE; done
results-0 252118 # kNoChecksum
results-1 251588 # kCRC32c
results-2 251863 # kxxHash
results-3 252016 # kxxHash64
results-4 252038 # kXXH3
Reviewed By: mrambacher
Differential Revision: D31905249
Pulled By: pdillinger
fbshipit-source-id: cb9b998ebe2523fc7c400eedf62124a78bf4b4d1
2021-10-29 05:13:47 +00:00
|
|
|
|
|
|
|
// Now test invalid checksum type
|
|
|
|
table_options.checksum = static_cast<ChecksumType>(123);
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
|
2017-02-27 20:19:23 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// On Windows you can have either memory mapped file or a file
|
|
|
|
// with unbuffered access. So this asserts and does not make
|
|
|
|
// sense to run
|
|
|
|
#ifndef OS_WIN
|
|
|
|
TEST_F(DBBasicTest, MmapAndBufferOptions) {
|
2017-06-26 23:52:06 +00:00
|
|
|
if (!IsMemoryMappedAccessSupported()) {
|
|
|
|
return;
|
|
|
|
}
|
2017-02-27 20:19:23 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
|
|
|
|
options.use_direct_reads = true;
|
|
|
|
options.allow_mmap_reads = true;
|
|
|
|
ASSERT_NOK(TryReopen(options));
|
|
|
|
|
|
|
|
// All other combinations are acceptable
|
|
|
|
options.use_direct_reads = false;
|
|
|
|
ASSERT_OK(TryReopen(options));
|
|
|
|
|
|
|
|
if (IsDirectIOSupported()) {
|
|
|
|
options.use_direct_reads = true;
|
|
|
|
options.allow_mmap_reads = false;
|
|
|
|
ASSERT_OK(TryReopen(options));
|
|
|
|
}
|
|
|
|
|
|
|
|
options.use_direct_reads = false;
|
|
|
|
ASSERT_OK(TryReopen(options));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2018-01-16 18:57:56 +00:00
|
|
|
class TestEnv : public EnvWrapper {
|
2020-03-24 18:21:10 +00:00
|
|
|
public:
|
|
|
|
explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {}
|
2022-01-05 00:44:54 +00:00
|
|
|
static const char* kClassName() { return "TestEnv"; }
|
|
|
|
const char* Name() const override { return kClassName(); }
|
2020-03-24 18:21:10 +00:00
|
|
|
|
|
|
|
class TestLogger : public Logger {
|
|
|
|
public:
|
|
|
|
using Logger::Logv;
|
|
|
|
explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
|
|
|
|
~TestLogger() override {
|
|
|
|
if (!closed_) {
|
2020-09-29 16:47:33 +00:00
|
|
|
CloseHelper().PermitUncheckedError();
|
2020-03-24 18:21:10 +00:00
|
|
|
}
|
2018-01-16 18:57:56 +00:00
|
|
|
}
|
2020-03-24 18:21:10 +00:00
|
|
|
void Logv(const char* /*format*/, va_list /*ap*/) override {}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
Status CloseImpl() override { return CloseHelper(); }
|
2018-02-23 21:50:02 +00:00
|
|
|
|
2018-03-07 00:13:05 +00:00
|
|
|
private:
|
2020-03-24 18:21:10 +00:00
|
|
|
Status CloseHelper() {
|
|
|
|
env->CloseCountInc();
|
|
|
|
;
|
|
|
|
return Status::IOError();
|
|
|
|
}
|
|
|
|
TestEnv* env;
|
|
|
|
};
|
|
|
|
|
|
|
|
void CloseCountInc() { close_count++; }
|
|
|
|
|
|
|
|
int GetCloseCount() { return close_count; }
|
|
|
|
|
|
|
|
Status NewLogger(const std::string& /*fname*/,
|
|
|
|
std::shared_ptr<Logger>* result) override {
|
|
|
|
result->reset(new TestLogger(this));
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
int close_count;
|
2018-01-16 18:57:56 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(DBBasicTest, DBClose) {
|
|
|
|
Options options = GetDefaultOptions();
|
2018-07-14 00:18:39 +00:00
|
|
|
std::string dbname = test::PerThreadDBPath("db_close_test");
|
2018-01-16 18:57:56 +00:00
|
|
|
ASSERT_OK(DestroyDB(dbname, options));
|
|
|
|
|
|
|
|
DB* db = nullptr;
|
2019-10-08 00:47:19 +00:00
|
|
|
TestEnv* env = new TestEnv(env_);
|
|
|
|
std::unique_ptr<TestEnv> local_env_guard(env);
|
2018-01-16 18:57:56 +00:00
|
|
|
options.create_if_missing = true;
|
2018-02-23 21:50:02 +00:00
|
|
|
options.env = env;
|
2018-01-16 18:57:56 +00:00
|
|
|
Status s = DB::Open(options, dbname, &db);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db != nullptr);
|
|
|
|
|
|
|
|
s = db->Close();
|
2018-02-23 21:50:02 +00:00
|
|
|
ASSERT_EQ(env->GetCloseCount(), 1);
|
|
|
|
ASSERT_EQ(s, Status::IOError());
|
2018-01-16 18:57:56 +00:00
|
|
|
|
|
|
|
delete db;
|
2018-02-23 21:50:02 +00:00
|
|
|
ASSERT_EQ(env->GetCloseCount(), 1);
|
|
|
|
|
|
|
|
// Do not call DB::Close() and ensure our logger Close() still gets called
|
|
|
|
s = DB::Open(options, dbname, &db);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db != nullptr);
|
|
|
|
delete db;
|
|
|
|
ASSERT_EQ(env->GetCloseCount(), 2);
|
2018-01-16 18:57:56 +00:00
|
|
|
|
|
|
|
// Provide our own logger and ensure DB::Close() does not close it
|
2018-02-23 21:50:02 +00:00
|
|
|
options.info_log.reset(new TestEnv::TestLogger(env));
|
2018-01-16 18:57:56 +00:00
|
|
|
options.create_if_missing = false;
|
|
|
|
s = DB::Open(options, dbname, &db);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db != nullptr);
|
|
|
|
|
|
|
|
s = db->Close();
|
|
|
|
ASSERT_EQ(s, Status::OK());
|
2018-01-17 01:26:29 +00:00
|
|
|
delete db;
|
2018-02-23 21:50:02 +00:00
|
|
|
ASSERT_EQ(env->GetCloseCount(), 2);
|
|
|
|
options.info_log.reset();
|
|
|
|
ASSERT_EQ(env->GetCloseCount(), 3);
|
2018-01-16 18:57:56 +00:00
|
|
|
}
|
|
|
|
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
std::string dbname = test::PerThreadDBPath("db_close_all_dir_fds_test");
|
|
|
|
// Configure a specific WAL directory
|
|
|
|
options.wal_dir = dbname + "_wal_dir";
|
|
|
|
// Configure 3 different data directories
|
|
|
|
options.db_paths.emplace_back(dbname + "_1", 512 * 1024);
|
|
|
|
options.db_paths.emplace_back(dbname + "_2", 4 * 1024 * 1024);
|
|
|
|
options.db_paths.emplace_back(dbname + "_3", 1024 * 1024 * 1024);
|
|
|
|
|
|
|
|
ASSERT_OK(DestroyDB(dbname, options));
|
|
|
|
|
|
|
|
DB* db = nullptr;
|
|
|
|
std::unique_ptr<Env> env = NewCompositeEnv(
|
|
|
|
std::make_shared<CountedFileSystem>(FileSystem::Default()));
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.env = env.get();
|
|
|
|
Status s = DB::Open(options, dbname, &db);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(db != nullptr);
|
|
|
|
|
|
|
|
// Explicitly close the database to ensure the open and close counter for
|
|
|
|
// directories are equivalent
|
|
|
|
s = db->Close();
|
|
|
|
auto* counted_fs =
|
|
|
|
options.env->GetFileSystem()->CheckedCast<CountedFileSystem>();
|
Fix serious FSDirectory use-after-Close bug (missing fsync) (#10460)
Summary:
TL;DR: due to a recent change, if you drop a column family,
often that DB will no longer fsync after writing new SST files
to remaining or new column families, which could lead to data
loss on power loss.
More bug detail:
The intent of https://github.com/facebook/rocksdb/issues/10049 was to Close FSDirectory objects at
DB::Close time rather than waiting for DB object destruction.
Unfortunately, it also closes shared FSDirectory objects on
DropColumnFamily (& destroy remaining handles), which can lead
to use-after-Close on FSDirectory shared with remaining column
families. Those "uses" are only Fsyncs (or redundant Closes). In
the default Posix filesystem, an Fsync on a closed FSDirectory is a
quiet no-op. Consequently (under most configurations), if you drop
a column family, that DB will no longer fsync after writing new SST
files to column families sharing the same directory (true under most
configurations).
More fix detail:
Basically, this removes unnecessary Close ops on destroying
ColumnFamilyData. We let `shared_ptr` take care of calling the
destructor at the right time. If the intent was to require Close be
called before destroying FSDirectory, that was not made clear by the
author of FileSystem and was not at all enforced by https://github.com/facebook/rocksdb/issues/10049, which
could have added `assert(fd_ == -1)` to `~PosixDirectory()` but did
not. To keep this fix simple, we relax the unit test for https://github.com/facebook/rocksdb/issues/10049 to allow
timely destruction of FSDirectory to suffice as Close (in
CountedFileSystem). Added a TODO to revisit that.
Also in this PR:
* Added a TODO to share FSDirectory instances between DB and its column
families. (Already shared among column families.)
* Made DB::Close attempt to close all its open FSDirectory objects even
if there is a failure in closing one. Also code clean-up around this
logic.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10460
Test Plan:
add an assert to check for use-after-Close. With that
existing tests can detect the misuse. With fix, tests pass (except noted
relaxing of unit test for https://github.com/facebook/rocksdb/issues/10049)
Reviewed By: ajkr
Differential Revision: D38357922
Pulled By: pdillinger
fbshipit-source-id: d42079cadbedf0a969f03389bf586b3b4e1f9137
2022-08-02 17:54:32 +00:00
|
|
|
ASSERT_TRUE(counted_fs != nullptr);
|
|
|
|
ASSERT_EQ(counted_fs->counters()->dir_opens,
|
|
|
|
counted_fs->counters()->dir_closes);
|
Explicitly closing all directory file descriptors (#10049)
Summary:
Currently, the DB directory file descriptor is left open until the deconstruction process (`DB::Close()` does not close the file descriptor). To verify this, comment out the lines between `db_ = nullptr` and `db_->Close()` (line 512, 513, 514, 515 in ldb_cmd.cc) to leak the ``db_'' object, build `ldb` tool and run
```
strace --trace=open,openat,close ./ldb --db=$TEST_TMPDIR --ignore_unknown_options put K1 V1 --create_if_missing
```
There is one directory file descriptor that is not closed in the strace log.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10049
Test Plan: Add a new unit test DBBasicTest.DBCloseAllDirectoryFDs: Open a database with different WAL directory and three different data directories, and all directory file descriptors should be closed after calling Close(). Explicitly call Close() after a directory file descriptor is not used so that the counter of directory open and close should be equivalent.
Reviewed By: ajkr, hx235
Differential Revision: D36722135
Pulled By: littlepig2013
fbshipit-source-id: 07bdc2abc417c6b30997b9bbef1f79aa757b21ff
2022-06-02 01:03:34 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
delete db;
|
|
|
|
}
|
|
|
|
|
2018-02-05 21:48:25 +00:00
|
|
|
TEST_F(DBBasicTest, DBCloseFlushError) {
|
|
|
|
std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
|
2019-10-08 00:47:19 +00:00
|
|
|
new FaultInjectionTestEnv(env_));
|
2018-02-05 21:48:25 +00:00
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.manual_wal_flush = true;
|
2020-03-24 18:21:10 +00:00
|
|
|
options.write_buffer_size = 100;
|
2018-02-05 21:48:25 +00:00
|
|
|
options.env = fault_injection_env.get();
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_OK(Put("key1", "value1"));
|
|
|
|
ASSERT_OK(Put("key2", "value2"));
|
|
|
|
ASSERT_OK(dbfull()->TEST_SwitchMemtable());
|
|
|
|
ASSERT_OK(Put("key3", "value3"));
|
|
|
|
fault_injection_env->SetFilesystemActive(false);
|
|
|
|
Status s = dbfull()->Close();
|
2021-10-19 03:31:32 +00:00
|
|
|
ASSERT_NE(s, Status::OK());
|
|
|
|
// retry should return the same error
|
|
|
|
s = dbfull()->Close();
|
|
|
|
ASSERT_NE(s, Status::OK());
|
2018-02-05 21:48:25 +00:00
|
|
|
fault_injection_env->SetFilesystemActive(true);
|
2021-10-19 03:31:32 +00:00
|
|
|
// retry close() is no-op even the system is back. Could be improved if
|
|
|
|
// Close() is retry-able: #9029
|
|
|
|
s = dbfull()->Close();
|
2018-02-05 21:48:25 +00:00
|
|
|
ASSERT_NE(s, Status::OK());
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
class DBMultiGetTestWithParam
|
|
|
|
: public DBBasicTest,
|
|
|
|
public testing::WithParamInterface<std::tuple<bool, bool>> {};
|
2019-11-12 21:51:18 +00:00
|
|
|
|
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
2019-01-02 19:40:12 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
|
|
|
"alyosha", "popovich"},
|
|
|
|
options);
|
2019-11-12 21:51:18 +00:00
|
|
|
// <CF, key, value> tuples
|
|
|
|
std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
|
|
|
|
static const int num_keys = 24;
|
|
|
|
cf_kv_vec.reserve(num_keys);
|
|
|
|
|
|
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
|
|
int cf = i / 3;
|
|
|
|
int cf_key = 1 % 3;
|
|
|
|
cf_kv_vec.emplace_back(std::make_tuple(
|
|
|
|
cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
|
|
|
|
"cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
|
|
|
|
ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
|
|
|
|
std::get<2>(cf_kv_vec[i])));
|
2019-01-02 19:40:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int get_sv_count = 0;
|
2020-07-03 02:24:25 +00:00
|
|
|
ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
2019-01-02 19:40:12 +00:00
|
|
|
"DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
|
|
|
|
if (++get_sv_count == 2) {
|
|
|
|
// After MultiGet refs a couple of CFs, flush all CFs so MultiGet
|
|
|
|
// is forced to repeat the process
|
2019-11-12 21:51:18 +00:00
|
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
|
|
int cf = i / 3;
|
|
|
|
int cf_key = i % 8;
|
|
|
|
if (cf_key == 0) {
|
|
|
|
ASSERT_OK(Flush(cf));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
|
|
|
|
std::get<2>(cf_kv_vec[i]) + "_2"));
|
2019-01-02 19:40:12 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (get_sv_count == 11) {
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
2020-07-03 02:24:25 +00:00
|
|
|
auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
|
2019-01-02 19:40:12 +00:00
|
|
|
db->GetColumnFamilyHandle(i))
|
|
|
|
->cfd();
|
|
|
|
ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
2019-01-02 19:40:12 +00:00
|
|
|
|
|
|
|
std::vector<int> cfs;
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
|
2019-11-12 21:51:18 +00:00
|
|
|
for (int i = 0; i < num_keys; ++i) {
|
|
|
|
cfs.push_back(std::get<0>(cf_kv_vec[i]));
|
|
|
|
keys.push_back(std::get<1>(cf_kv_vec[i]));
|
2019-01-02 19:40:12 +00:00
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
|
|
|
|
std::get<1>(GetParam()));
|
2019-11-12 21:51:18 +00:00
|
|
|
ASSERT_EQ(values.size(), num_keys);
|
2019-01-02 19:40:12 +00:00
|
|
|
for (unsigned int j = 0; j < values.size(); ++j) {
|
2019-11-12 21:51:18 +00:00
|
|
|
ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
|
2019-01-02 19:40:12 +00:00
|
|
|
}
|
2019-11-12 21:51:18 +00:00
|
|
|
|
|
|
|
keys.clear();
|
|
|
|
cfs.clear();
|
|
|
|
cfs.push_back(std::get<0>(cf_kv_vec[0]));
|
|
|
|
keys.push_back(std::get<1>(cf_kv_vec[0]));
|
|
|
|
cfs.push_back(std::get<0>(cf_kv_vec[3]));
|
|
|
|
keys.push_back(std::get<1>(cf_kv_vec[3]));
|
|
|
|
cfs.push_back(std::get<0>(cf_kv_vec[4]));
|
|
|
|
keys.push_back(std::get<1>(cf_kv_vec[4]));
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
|
|
|
|
std::get<1>(GetParam()));
|
2019-11-12 21:51:18 +00:00
|
|
|
ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
|
|
|
|
ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
|
|
|
|
ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
|
|
|
|
|
|
|
|
keys.clear();
|
|
|
|
cfs.clear();
|
|
|
|
cfs.push_back(std::get<0>(cf_kv_vec[7]));
|
|
|
|
keys.push_back(std::get<1>(cf_kv_vec[7]));
|
|
|
|
cfs.push_back(std::get<0>(cf_kv_vec[6]));
|
|
|
|
keys.push_back(std::get<1>(cf_kv_vec[6]));
|
|
|
|
cfs.push_back(std::get<0>(cf_kv_vec[1]));
|
|
|
|
keys.push_back(std::get<1>(cf_kv_vec[1]));
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
|
|
|
|
std::get<1>(GetParam()));
|
2019-11-12 21:51:18 +00:00
|
|
|
ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
|
|
|
|
ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
|
|
|
|
ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
|
|
|
|
|
|
|
|
for (int cf = 0; cf < 8; ++cf) {
|
2020-07-03 02:24:25 +00:00
|
|
|
auto* cfd =
|
|
|
|
static_cast_with_check<ColumnFamilyHandleImpl>(
|
|
|
|
static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(cf))
|
|
|
|
->cfd();
|
2019-01-02 19:40:12 +00:00
|
|
|
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
|
|
|
|
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-11-12 21:51:18 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
2019-01-02 19:40:12 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
|
|
|
"alyosha", "popovich"},
|
|
|
|
options);
|
|
|
|
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
|
|
|
ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
|
|
|
|
"cf" + std::to_string(i) + "_val"));
|
|
|
|
}
|
|
|
|
|
|
|
|
int get_sv_count = 0;
|
|
|
|
int retries = 0;
|
|
|
|
bool last_try = false;
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
2019-01-02 19:40:12 +00:00
|
|
|
"DBImpl::MultiGet::LastTry", [&](void* /*arg*/) {
|
|
|
|
last_try = true;
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
2019-01-02 19:40:12 +00:00
|
|
|
});
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
2019-01-02 19:40:12 +00:00
|
|
|
"DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
|
|
|
|
if (last_try) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (++get_sv_count == 2) {
|
|
|
|
++retries;
|
|
|
|
get_sv_count = 0;
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
|
|
|
ASSERT_OK(Flush(i));
|
|
|
|
ASSERT_OK(Put(
|
|
|
|
i, "cf" + std::to_string(i) + "_key",
|
|
|
|
"cf" + std::to_string(i) + "_val" + std::to_string(retries)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
2019-01-02 19:40:12 +00:00
|
|
|
|
|
|
|
std::vector<int> cfs;
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
|
|
|
cfs.push_back(i);
|
|
|
|
keys.push_back("cf" + std::to_string(i) + "_key");
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
|
|
|
|
std::get<1>(GetParam()));
|
2019-01-02 19:40:12 +00:00
|
|
|
ASSERT_TRUE(last_try);
|
|
|
|
ASSERT_EQ(values.size(), 8);
|
|
|
|
for (unsigned int j = 0; j < values.size(); ++j) {
|
|
|
|
ASSERT_EQ(values[j],
|
|
|
|
"cf" + std::to_string(j) + "_val" + std::to_string(retries));
|
|
|
|
}
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
2020-07-03 02:24:25 +00:00
|
|
|
auto* cfd =
|
|
|
|
static_cast_with_check<ColumnFamilyHandleImpl>(
|
|
|
|
static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
|
|
|
|
->cfd();
|
2019-01-02 19:40:12 +00:00
|
|
|
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-11-12 21:51:18 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
2019-01-02 19:40:12 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
|
|
|
|
"alyosha", "popovich"},
|
|
|
|
options);
|
|
|
|
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
|
|
|
ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
|
|
|
|
"cf" + std::to_string(i) + "_val"));
|
|
|
|
}
|
|
|
|
|
|
|
|
int get_sv_count = 0;
|
2020-07-03 02:24:25 +00:00
|
|
|
ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
2019-01-02 19:40:12 +00:00
|
|
|
"DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
|
|
|
|
if (++get_sv_count == 2) {
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
|
|
|
ASSERT_OK(Flush(i));
|
|
|
|
ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
|
|
|
|
"cf" + std::to_string(i) + "_val2"));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (get_sv_count == 8) {
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
2020-07-03 02:24:25 +00:00
|
|
|
auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
|
2019-01-02 19:40:12 +00:00
|
|
|
db->GetColumnFamilyHandle(i))
|
|
|
|
->cfd();
|
|
|
|
ASSERT_TRUE(
|
|
|
|
(cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) ||
|
|
|
|
(cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
2019-01-02 19:40:12 +00:00
|
|
|
|
|
|
|
std::vector<int> cfs;
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
|
|
|
cfs.push_back(i);
|
|
|
|
keys.push_back("cf" + std::to_string(i) + "_key");
|
|
|
|
}
|
|
|
|
|
|
|
|
const Snapshot* snapshot = db_->GetSnapshot();
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
values = MultiGet(cfs, keys, snapshot, std::get<0>(GetParam()),
|
|
|
|
std::get<1>(GetParam()));
|
2019-01-02 19:40:12 +00:00
|
|
|
db_->ReleaseSnapshot(snapshot);
|
|
|
|
ASSERT_EQ(values.size(), 8);
|
|
|
|
for (unsigned int j = 0; j < values.size(); ++j) {
|
|
|
|
ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
|
|
|
|
}
|
|
|
|
for (int i = 0; i < 8; ++i) {
|
2020-07-03 02:24:25 +00:00
|
|
|
auto* cfd =
|
|
|
|
static_cast_with_check<ColumnFamilyHandleImpl>(
|
|
|
|
static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
|
|
|
|
->cfd();
|
2019-01-02 19:40:12 +00:00
|
|
|
ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-06 23:26:04 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
2021-08-06 23:26:04 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
CreateAndReopenWithCF({"one", "two"}, options);
|
|
|
|
|
|
|
|
ASSERT_OK(Put(1, "foo", "bar"));
|
|
|
|
ASSERT_OK(Put(2, "baz", "xyz"));
|
|
|
|
ASSERT_OK(Put(1, "abc", "def"));
|
|
|
|
|
|
|
|
// Note: keys for the same CF do not form a consecutive range
|
|
|
|
std::vector<int> cfs{1, 2, 1};
|
|
|
|
std::vector<std::string> keys{"foo", "baz", "abc"};
|
|
|
|
std::vector<std::string> values;
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
values = MultiGet(cfs, keys, /* snapshot */ nullptr,
|
|
|
|
/* batched */ std::get<0>(GetParam()),
|
|
|
|
/* async */ std::get<1>(GetParam()));
|
2021-08-06 23:26:04 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(values.size(), 3);
|
|
|
|
ASSERT_EQ(values[0], "bar");
|
|
|
|
ASSERT_EQ(values[1], "xyz");
|
|
|
|
ASSERT_EQ(values[2], "def");
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSimpleUnsorted) {
|
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
|
|
|
// Skip for unbatched MultiGet
|
|
|
|
if (!std::get<0>(GetParam())) {
|
2022-05-31 20:02:50 +00:00
|
|
|
ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
return;
|
|
|
|
}
|
2019-04-15 18:32:31 +00:00
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
SetPerfLevel(kEnableCount);
|
|
|
|
ASSERT_OK(Put(1, "k1", "v1"));
|
|
|
|
ASSERT_OK(Put(1, "k2", "v2"));
|
|
|
|
ASSERT_OK(Put(1, "k3", "v3"));
|
|
|
|
ASSERT_OK(Put(1, "k4", "v4"));
|
|
|
|
ASSERT_OK(Delete(1, "k4"));
|
|
|
|
ASSERT_OK(Put(1, "k5", "v5"));
|
|
|
|
ASSERT_OK(Delete(1, "no_key"));
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
|
|
|
std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k2", "k1"});
|
|
|
|
std::vector<PinnableSlice> values(keys.size());
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
|
|
|
|
std::vector<Status> s(keys.size());
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = std::get<1>(GetParam());
|
|
|
|
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
|
|
|
|
s.data(), false);
|
2019-04-15 18:32:31 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
|
|
|
|
ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2");
|
|
|
|
ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
|
|
|
|
ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
|
|
|
|
// four kv pairs * two bytes per value
|
|
|
|
ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
|
|
|
|
|
|
|
|
ASSERT_TRUE(s[0].IsNotFound());
|
|
|
|
ASSERT_OK(s[1]);
|
|
|
|
ASSERT_TRUE(s[2].IsNotFound());
|
|
|
|
ASSERT_OK(s[3]);
|
|
|
|
ASSERT_OK(s[4]);
|
|
|
|
ASSERT_OK(s[5]);
|
|
|
|
|
|
|
|
SetPerfLevel(kDisable);
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSortedMultiFile) {
|
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
|
|
|
// Skip for unbatched MultiGet
|
|
|
|
if (!std::get<0>(GetParam())) {
|
2022-05-31 20:02:50 +00:00
|
|
|
ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
return;
|
|
|
|
}
|
2019-04-15 18:32:31 +00:00
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
SetPerfLevel(kEnableCount);
|
2020-05-07 22:39:49 +00:00
|
|
|
// To expand the power of this test, generate > 1 table file and
|
|
|
|
// mix with memtable
|
2019-04-15 18:32:31 +00:00
|
|
|
ASSERT_OK(Put(1, "k1", "v1"));
|
|
|
|
ASSERT_OK(Put(1, "k2", "v2"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2019-04-15 18:32:31 +00:00
|
|
|
ASSERT_OK(Put(1, "k3", "v3"));
|
|
|
|
ASSERT_OK(Put(1, "k4", "v4"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2019-04-15 18:32:31 +00:00
|
|
|
ASSERT_OK(Delete(1, "k4"));
|
|
|
|
ASSERT_OK(Put(1, "k5", "v5"));
|
|
|
|
ASSERT_OK(Delete(1, "no_key"));
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
|
|
|
std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
|
|
|
|
std::vector<PinnableSlice> values(keys.size());
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
|
|
|
|
std::vector<Status> s(keys.size());
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = std::get<1>(GetParam());
|
|
|
|
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
|
|
|
|
s.data(), true);
|
2019-04-15 18:32:31 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1");
|
|
|
|
ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2");
|
|
|
|
ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
|
|
|
|
ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5");
|
|
|
|
// four kv pairs * two bytes per value
|
|
|
|
ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
|
|
|
|
|
|
|
|
ASSERT_OK(s[0]);
|
|
|
|
ASSERT_OK(s[1]);
|
|
|
|
ASSERT_OK(s[2]);
|
|
|
|
ASSERT_TRUE(s[3].IsNotFound());
|
|
|
|
ASSERT_OK(s[4]);
|
|
|
|
ASSERT_TRUE(s[5].IsNotFound());
|
|
|
|
|
|
|
|
SetPerfLevel(kDisable);
|
2020-05-07 22:39:49 +00:00
|
|
|
} while (ChangeOptions());
|
2019-04-15 18:32:31 +00:00
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedDuplicateKeys) {
|
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
|
|
|
// Skip for unbatched MultiGet
|
|
|
|
if (!std::get<0>(GetParam())) {
|
2022-05-31 20:02:50 +00:00
|
|
|
ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
return;
|
|
|
|
}
|
2020-06-08 23:08:31 +00:00
|
|
|
Options opts = CurrentOptions();
|
|
|
|
opts.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, opts);
|
|
|
|
SetPerfLevel(kEnableCount);
|
|
|
|
// To expand the power of this test, generate > 1 table file and
|
|
|
|
// mix with memtable
|
|
|
|
ASSERT_OK(Merge(1, "k1", "v1"));
|
|
|
|
ASSERT_OK(Merge(1, "k2", "v2"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-06-08 23:08:31 +00:00
|
|
|
MoveFilesToLevel(2, 1);
|
|
|
|
ASSERT_OK(Merge(1, "k3", "v3"));
|
|
|
|
ASSERT_OK(Merge(1, "k4", "v4"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-06-08 23:08:31 +00:00
|
|
|
MoveFilesToLevel(2, 1);
|
|
|
|
ASSERT_OK(Merge(1, "k4", "v4_2"));
|
|
|
|
ASSERT_OK(Merge(1, "k6", "v6"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-06-08 23:08:31 +00:00
|
|
|
MoveFilesToLevel(2, 1);
|
|
|
|
ASSERT_OK(Merge(1, "k7", "v7"));
|
|
|
|
ASSERT_OK(Merge(1, "k8", "v8"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-06-08 23:08:31 +00:00
|
|
|
MoveFilesToLevel(2, 1);
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
|
|
|
std::vector<Slice> keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"});
|
|
|
|
std::vector<PinnableSlice> values(keys.size());
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
|
|
|
|
std::vector<Status> s(keys.size());
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = std::get<1>(GetParam());
|
|
|
|
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
|
|
|
|
s.data(), false);
|
2020-06-08 23:08:31 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8");
|
|
|
|
ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8");
|
|
|
|
ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8");
|
|
|
|
ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2");
|
|
|
|
ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2");
|
|
|
|
ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
|
|
|
|
ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3");
|
|
|
|
ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes);
|
|
|
|
|
|
|
|
for (Status& status : s) {
|
|
|
|
ASSERT_OK(status);
|
|
|
|
}
|
|
|
|
|
|
|
|
SetPerfLevel(kDisable);
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevel) {
|
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
|
|
|
// Skip for unbatched MultiGet
|
|
|
|
if (!std::get<0>(GetParam())) {
|
2022-05-31 20:02:50 +00:00
|
|
|
ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
return;
|
|
|
|
}
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
Reopen(options);
|
|
|
|
int num_keys = 0;
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; ++i) {
|
|
|
|
ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 3) {
|
|
|
|
ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 5) {
|
|
|
|
ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
2019-04-12 17:42:06 +00:00
|
|
|
ASSERT_EQ(0, num_keys);
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 9) {
|
|
|
|
ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
|
|
|
|
for (int i = 64; i < 80; ++i) {
|
|
|
|
keys.push_back("key_" + std::to_string(i));
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
|
Introduce a new MultiGet batching implementation (#5011)
Summary:
This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching.
Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to -
1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch()
2. Bloom filter cachelines can be prefetched, hiding the cache miss latency
The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress.
Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32).
Batch Sizes
1 | 2 | 4 | 8 | 16 | 32
Random pattern (Stride length 0)
4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get
4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching)
4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching)
Good locality (Stride length 16)
4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753
4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781
4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135
Good locality (Stride length 256)
4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232
4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268
4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62
Medium locality (Stride length 4096)
4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555
4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465
4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891
dbbench command used (on a DB with 4 levels, 12 million keys)-
TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011
Differential Revision: D14348703
Pulled By: anand1976
fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
2019-04-11 21:24:09 +00:00
|
|
|
ASSERT_EQ(values.size(), 16);
|
|
|
|
for (unsigned int j = 0; j < values.size(); ++j) {
|
|
|
|
int key = j + 64;
|
|
|
|
if (key % 9 == 0) {
|
|
|
|
ASSERT_EQ(values[j], "val_mem_" + std::to_string(key));
|
|
|
|
} else if (key % 5 == 0) {
|
|
|
|
ASSERT_EQ(values[j], "val_l0_" + std::to_string(key));
|
|
|
|
} else if (key % 3 == 0) {
|
|
|
|
ASSERT_EQ(values[j], "val_l1_" + std::to_string(key));
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(values[j], "val_l2_" + std::to_string(key));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-06-06 06:07:28 +00:00
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevelMerge) {
|
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
|
|
|
// Skip for unbatched MultiGet
|
|
|
|
if (!std::get<0>(GetParam())) {
|
2022-05-31 20:02:50 +00:00
|
|
|
ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
return;
|
|
|
|
}
|
2020-02-12 01:25:10 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
Reopen(options);
|
|
|
|
int num_keys = 0;
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; ++i) {
|
|
|
|
ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-02-12 01:25:10 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-02-12 01:25:10 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 3) {
|
|
|
|
ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-02-12 01:25:10 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-02-12 01:25:10 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 5) {
|
|
|
|
ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-02-12 01:25:10 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-02-12 01:25:10 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(0, num_keys);
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 9) {
|
2020-03-24 18:21:10 +00:00
|
|
|
ASSERT_OK(
|
|
|
|
Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
|
2020-02-12 01:25:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> keys;
|
|
|
|
std::vector<std::string> values;
|
|
|
|
|
|
|
|
for (int i = 32; i < 80; ++i) {
|
|
|
|
keys.push_back("key_" + std::to_string(i));
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
|
2020-02-12 01:25:10 +00:00
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
for (unsigned int j = 0; j < 48; ++j) {
|
|
|
|
int key = j + 32;
|
|
|
|
std::string value;
|
|
|
|
value.append("val_l2_" + std::to_string(key));
|
|
|
|
if (key % 3 == 0) {
|
|
|
|
value.append(",");
|
|
|
|
value.append("val_l1_" + std::to_string(key));
|
|
|
|
}
|
|
|
|
if (key % 5 == 0) {
|
|
|
|
value.append(",");
|
|
|
|
value.append("val_l0_" + std::to_string(key));
|
|
|
|
}
|
|
|
|
if (key % 9 == 0) {
|
|
|
|
value.append(",");
|
|
|
|
value.append("val_mem_" + std::to_string(key));
|
|
|
|
}
|
|
|
|
ASSERT_EQ(values[j], value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeInMemory) {
|
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
|
|
|
// Skip for unbatched MultiGet
|
|
|
|
if (!std::get<0>(GetParam())) {
|
2022-05-31 20:02:50 +00:00
|
|
|
ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
return;
|
|
|
|
}
|
2020-05-27 20:03:08 +00:00
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
SetPerfLevel(kEnableCount);
|
|
|
|
ASSERT_OK(Put(1, "k1", "v_1"));
|
|
|
|
ASSERT_OK(Put(1, "k2", "v_2"));
|
|
|
|
ASSERT_OK(Put(1, "k3", "v_3"));
|
|
|
|
ASSERT_OK(Put(1, "k4", "v_4"));
|
|
|
|
ASSERT_OK(Put(1, "k5", "v_5"));
|
|
|
|
ASSERT_OK(Put(1, "k6", "v_6"));
|
|
|
|
std::vector<Slice> keys = {"k1", "k2", "k3", "k4", "k5", "k6"};
|
|
|
|
std::vector<PinnableSlice> values(keys.size());
|
|
|
|
std::vector<Status> s(keys.size());
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.value_size_soft_limit = 11;
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ro.async_io = std::get<1>(GetParam());
|
2020-05-27 20:03:08 +00:00
|
|
|
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
|
|
|
|
s.data(), false);
|
|
|
|
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
for (unsigned int i = 0; i < 4; i++) {
|
|
|
|
ASSERT_EQ(std::string(values[i].data(), values[i].size()),
|
|
|
|
"v_" + std::to_string(i + 1));
|
|
|
|
}
|
|
|
|
|
|
|
|
for (unsigned int i = 4; i < 6; i++) {
|
|
|
|
ASSERT_TRUE(s[i].IsAborted());
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT_EQ(12, (int)get_perf_context()->multiget_read_bytes);
|
|
|
|
SetPerfLevel(kDisable);
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSize) {
|
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (std::get<1>(GetParam())) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
|
|
|
// Skip for unbatched MultiGet
|
|
|
|
if (!std::get<0>(GetParam())) {
|
|
|
|
return;
|
|
|
|
}
|
2020-05-27 20:03:08 +00:00
|
|
|
do {
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
|
|
|
|
SetPerfLevel(kEnableCount);
|
|
|
|
|
|
|
|
ASSERT_OK(Put(1, "k6", "v6"));
|
|
|
|
ASSERT_OK(Put(1, "k7", "v7_"));
|
|
|
|
ASSERT_OK(Put(1, "k3", "v3_"));
|
|
|
|
ASSERT_OK(Put(1, "k4", "v4"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-05-27 20:03:08 +00:00
|
|
|
ASSERT_OK(Delete(1, "k4"));
|
|
|
|
ASSERT_OK(Put(1, "k11", "v11"));
|
|
|
|
ASSERT_OK(Delete(1, "no_key"));
|
|
|
|
ASSERT_OK(Put(1, "k8", "v8_"));
|
|
|
|
ASSERT_OK(Put(1, "k13", "v13"));
|
|
|
|
ASSERT_OK(Put(1, "k14", "v14"));
|
|
|
|
ASSERT_OK(Put(1, "k15", "v15"));
|
|
|
|
ASSERT_OK(Put(1, "k16", "v16"));
|
|
|
|
ASSERT_OK(Put(1, "k17", "v17"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-05-27 20:03:08 +00:00
|
|
|
|
|
|
|
ASSERT_OK(Put(1, "k1", "v1_"));
|
|
|
|
ASSERT_OK(Put(1, "k2", "v2_"));
|
|
|
|
ASSERT_OK(Put(1, "k5", "v5_"));
|
|
|
|
ASSERT_OK(Put(1, "k9", "v9_"));
|
|
|
|
ASSERT_OK(Put(1, "k10", "v10"));
|
|
|
|
ASSERT_OK(Delete(1, "k2"));
|
|
|
|
ASSERT_OK(Delete(1, "k6"));
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
|
|
|
std::vector<Slice> keys({"k1", "k10", "k11", "k12", "k13", "k14", "k15",
|
|
|
|
"k16", "k17", "k2", "k3", "k4", "k5", "k6", "k7",
|
|
|
|
"k8", "k9", "no_key"});
|
|
|
|
std::vector<PinnableSlice> values(keys.size());
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
|
|
|
|
std::vector<Status> s(keys.size());
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.value_size_soft_limit = 20;
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ro.async_io = std::get<1>(GetParam());
|
2020-05-27 20:03:08 +00:00
|
|
|
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
|
|
|
|
s.data(), false);
|
|
|
|
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
|
|
|
|
// In memory keys
|
|
|
|
ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1_");
|
|
|
|
ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v10");
|
|
|
|
ASSERT_TRUE(s[9].IsNotFound()); // k2
|
|
|
|
ASSERT_EQ(std::string(values[12].data(), values[12].size()), "v5_");
|
|
|
|
ASSERT_TRUE(s[13].IsNotFound()); // k6
|
|
|
|
ASSERT_EQ(std::string(values[16].data(), values[16].size()), "v9_");
|
|
|
|
|
|
|
|
// In sst files
|
|
|
|
ASSERT_EQ(std::string(values[2].data(), values[1].size()), "v11");
|
|
|
|
ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v13");
|
|
|
|
ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v14");
|
|
|
|
|
|
|
|
// Remaining aborted after value_size exceeds.
|
|
|
|
ASSERT_TRUE(s[3].IsAborted());
|
|
|
|
ASSERT_TRUE(s[6].IsAborted());
|
|
|
|
ASSERT_TRUE(s[7].IsAborted());
|
|
|
|
ASSERT_TRUE(s[8].IsAborted());
|
|
|
|
ASSERT_TRUE(s[10].IsAborted());
|
|
|
|
ASSERT_TRUE(s[11].IsAborted());
|
|
|
|
ASSERT_TRUE(s[14].IsAborted());
|
|
|
|
ASSERT_TRUE(s[15].IsAborted());
|
|
|
|
ASSERT_TRUE(s[17].IsAborted());
|
|
|
|
|
|
|
|
// 6 kv pairs * 3 bytes per value (i.e. 18)
|
|
|
|
ASSERT_EQ(21, (int)get_perf_context()->multiget_read_bytes);
|
|
|
|
SetPerfLevel(kDisable);
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeMultiLevelMerge) {
|
|
|
|
if (std::get<1>(GetParam())) {
|
2022-09-14 16:59:54 +00:00
|
|
|
ROCKSDB_GTEST_BYPASS("This test needs to be fixed for async IO");
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
// Skip for unbatched MultiGet
|
|
|
|
if (!std::get<0>(GetParam())) {
|
2022-05-31 20:02:50 +00:00
|
|
|
ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
return;
|
|
|
|
}
|
2020-05-27 20:03:08 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.merge_operator = MergeOperators::CreateStringAppendOperator();
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
Reopen(options);
|
|
|
|
int num_keys = 0;
|
|
|
|
|
|
|
|
for (int i = 0; i < 64; ++i) {
|
|
|
|
ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-05-27 20:03:08 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-05-27 20:03:08 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
for (int i = 0; i < 64; i += 3) {
|
|
|
|
ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-05-27 20:03:08 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-05-27 20:03:08 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(1);
|
|
|
|
|
|
|
|
for (int i = 0; i < 64; i += 5) {
|
|
|
|
ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-05-27 20:03:08 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-05-27 20:03:08 +00:00
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
ASSERT_EQ(0, num_keys);
|
|
|
|
|
|
|
|
for (int i = 0; i < 64; i += 9) {
|
|
|
|
ASSERT_OK(
|
|
|
|
Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<std::string> keys_str;
|
|
|
|
for (int i = 10; i < 50; ++i) {
|
|
|
|
keys_str.push_back("key_" + std::to_string(i));
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Slice> keys(keys_str.size());
|
|
|
|
for (int i = 0; i < 40; i++) {
|
|
|
|
keys[i] = Slice(keys_str[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<PinnableSlice> values(keys_str.size());
|
|
|
|
std::vector<Status> statuses(keys_str.size());
|
|
|
|
ReadOptions read_options;
|
|
|
|
read_options.verify_checksums = true;
|
|
|
|
read_options.value_size_soft_limit = 380;
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
read_options.async_io = std::get<1>(GetParam());
|
2020-05-27 20:03:08 +00:00
|
|
|
db_->MultiGet(read_options, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
|
|
|
|
for (unsigned int j = 0; j < 26; ++j) {
|
|
|
|
int key = j + 10;
|
|
|
|
std::string value;
|
|
|
|
value.append("val_l2_" + std::to_string(key));
|
|
|
|
if (key % 3 == 0) {
|
|
|
|
value.append(",");
|
|
|
|
value.append("val_l1_" + std::to_string(key));
|
|
|
|
}
|
|
|
|
if (key % 5 == 0) {
|
|
|
|
value.append(",");
|
|
|
|
value.append("val_l0_" + std::to_string(key));
|
|
|
|
}
|
|
|
|
if (key % 9 == 0) {
|
|
|
|
value.append(",");
|
|
|
|
value.append("val_mem_" + std::to_string(key));
|
|
|
|
}
|
|
|
|
ASSERT_EQ(values[j], value);
|
|
|
|
ASSERT_OK(statuses[j]);
|
|
|
|
}
|
|
|
|
|
|
|
|
// All remaning keys status is set Status::Abort
|
|
|
|
for (unsigned int j = 26; j < 40; j++) {
|
|
|
|
ASSERT_TRUE(statuses[j].IsAborted());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
|
|
|
|
testing::Combine(testing::Bool(), testing::Bool()));
|
|
|
|
|
|
|
|
#if USE_COROUTINES
|
2022-08-19 23:52:52 +00:00
|
|
|
class DBMultiGetAsyncIOTest : public DBBasicTest,
|
|
|
|
public ::testing::WithParamInterface<bool> {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
public:
|
|
|
|
DBMultiGetAsyncIOTest()
|
|
|
|
: DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) {
|
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10));
|
2022-12-05 06:58:25 +00:00
|
|
|
options_ = CurrentOptions();
|
|
|
|
options_.disable_auto_compactions = true;
|
|
|
|
options_.statistics = statistics_;
|
|
|
|
options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
2023-02-03 00:35:27 +00:00
|
|
|
options_.env = Env::Default();
|
2022-12-05 06:58:25 +00:00
|
|
|
Reopen(options_);
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
int num_keys = 0;
|
|
|
|
|
|
|
|
// Put all keys in the bottommost level, and overwrite some keys
|
|
|
|
// in L0 and L1
|
2022-08-09 21:44:47 +00:00
|
|
|
for (int i = 0; i < 256; ++i) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
|
|
|
EXPECT_OK(Flush());
|
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
|
|
|
EXPECT_OK(Flush());
|
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
MoveFilesToLevel(2);
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 3) {
|
|
|
|
EXPECT_OK(Put(Key(i), "val_l1_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
|
|
|
EXPECT_OK(Flush());
|
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
|
|
|
EXPECT_OK(Flush());
|
|
|
|
num_keys = 0;
|
|
|
|
}
|
2022-08-09 21:44:47 +00:00
|
|
|
// Put some range deletes in L1
|
|
|
|
for (int i = 128; i < 256; i += 32) {
|
|
|
|
std::string range_begin = Key(i);
|
|
|
|
std::string range_end = Key(i + 16);
|
|
|
|
EXPECT_OK(dbfull()->DeleteRange(WriteOptions(),
|
|
|
|
dbfull()->DefaultColumnFamily(),
|
|
|
|
range_begin, range_end));
|
|
|
|
// Also do some Puts to force creation of bloom filter
|
|
|
|
for (int j = i + 16; j < i + 32; ++j) {
|
|
|
|
if (j % 3 == 0) {
|
|
|
|
EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPECT_OK(Flush());
|
|
|
|
}
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
MoveFilesToLevel(1);
|
|
|
|
|
|
|
|
for (int i = 0; i < 128; i += 5) {
|
|
|
|
EXPECT_OK(Put(Key(i), "val_l0_" + std::to_string(i)));
|
|
|
|
num_keys++;
|
|
|
|
if (num_keys == 8) {
|
|
|
|
EXPECT_OK(Flush());
|
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (num_keys > 0) {
|
|
|
|
EXPECT_OK(Flush());
|
|
|
|
num_keys = 0;
|
|
|
|
}
|
|
|
|
EXPECT_EQ(0, num_keys);
|
|
|
|
}
|
|
|
|
|
|
|
|
const std::shared_ptr<Statistics>& statistics() { return statistics_; }
|
|
|
|
|
2022-12-05 06:58:25 +00:00
|
|
|
protected:
|
2023-02-03 00:35:27 +00:00
|
|
|
void PrepareDBForTest() {
|
|
|
|
#ifdef ROCKSDB_IOURING_PRESENT
|
|
|
|
Reopen(options_);
|
|
|
|
#else // ROCKSDB_IOURING_PRESENT
|
|
|
|
// Warm up the block cache so we don't need to use the IO uring
|
|
|
|
Iterator* iter = dbfull()->NewIterator(ReadOptions());
|
|
|
|
for (iter->SeekToFirst(); iter->Valid() && iter->status().ok();
|
|
|
|
iter->Next())
|
|
|
|
;
|
|
|
|
EXPECT_OK(iter->status());
|
|
|
|
delete iter;
|
|
|
|
#endif // ROCKSDB_IOURING_PRESENT
|
|
|
|
}
|
|
|
|
|
2022-12-05 06:58:25 +00:00
|
|
|
void ReopenDB() { Reopen(options_); }
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
private:
|
|
|
|
std::shared_ptr<Statistics> statistics_;
|
2022-12-05 06:58:25 +00:00
|
|
|
Options options_;
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
};
|
|
|
|
|
2022-08-19 23:52:52 +00:00
|
|
|
TEST_P(DBMultiGetAsyncIOTest, GetFromL0) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
// All 3 keys in L0. The L0 files should be read serially.
|
|
|
|
std::vector<std::string> key_strs{Key(0), Key(40), Key(80)};
|
|
|
|
std::vector<Slice> keys{key_strs[0], key_strs[1], key_strs[2]};
|
|
|
|
std::vector<PinnableSlice> values(key_strs.size());
|
|
|
|
std::vector<Status> statuses(key_strs.size());
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
PrepareDBForTest();
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
2022-08-19 23:52:52 +00:00
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
ASSERT_EQ(values.size(), 3);
|
|
|
|
ASSERT_OK(statuses[0]);
|
|
|
|
ASSERT_OK(statuses[1]);
|
|
|
|
ASSERT_OK(statuses[2]);
|
|
|
|
ASSERT_EQ(values[0], "val_l0_" + std::to_string(0));
|
|
|
|
ASSERT_EQ(values[1], "val_l0_" + std::to_string(40));
|
|
|
|
ASSERT_EQ(values[2], "val_l0_" + std::to_string(80));
|
|
|
|
|
|
|
|
HistogramData multiget_io_batch_size;
|
|
|
|
|
|
|
|
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
|
|
|
|
|
2022-08-19 23:52:52 +00:00
|
|
|
// With async IO, lookups will happen in parallel for each key
|
2023-02-03 00:35:27 +00:00
|
|
|
#ifdef ROCKSDB_IOURING_PRESENT
|
2022-08-19 23:52:52 +00:00
|
|
|
if (GetParam()) {
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.count, 1);
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.max, 3);
|
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
|
|
|
|
} else {
|
2022-09-01 04:03:52 +00:00
|
|
|
// Without Async IO, MultiGet will call MultiRead 3 times, once for each
|
|
|
|
// L0 file
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.count, 3);
|
2022-08-19 23:52:52 +00:00
|
|
|
}
|
2023-02-03 00:35:27 +00:00
|
|
|
#else // ROCKSDB_IOURING_PRESENT
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
|
2023-02-03 00:35:27 +00:00
|
|
|
#endif // ROCKSDB_IOURING_PRESENT
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
}
|
|
|
|
|
2022-08-19 23:52:52 +00:00
|
|
|
TEST_P(DBMultiGetAsyncIOTest, GetFromL1) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
|
|
|
|
key_strs.push_back(Key(33));
|
|
|
|
key_strs.push_back(Key(54));
|
|
|
|
key_strs.push_back(Key(102));
|
|
|
|
keys.push_back(key_strs[0]);
|
|
|
|
keys.push_back(key_strs[1]);
|
|
|
|
keys.push_back(key_strs[2]);
|
|
|
|
values.resize(keys.size());
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
PrepareDBForTest();
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
2022-08-19 23:52:52 +00:00
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
ASSERT_EQ(values.size(), 3);
|
|
|
|
ASSERT_EQ(statuses[0], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[1], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[2], Status::OK());
|
|
|
|
ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
|
|
|
|
ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
|
|
|
|
ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
|
|
|
|
|
|
|
|
HistogramData multiget_io_batch_size;
|
|
|
|
|
|
|
|
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
|
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
#ifdef ROCKSDB_IOURING_PRESENT
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
// A batch of 3 async IOs is expected, one for each overlapping file in L1
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.count, 1);
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.max, 3);
|
2022-06-16 19:12:43 +00:00
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
|
2023-03-17 21:57:09 +00:00
|
|
|
#else // ROCKSDB_IOURING_PRESENT
|
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
|
|
|
|
#endif // ROCKSDB_IOURING_PRESENT
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
}
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
#ifdef ROCKSDB_IOURING_PRESENT
|
2022-12-05 06:58:25 +00:00
|
|
|
TEST_P(DBMultiGetAsyncIOTest, GetFromL1Error) {
|
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
|
|
|
|
key_strs.push_back(Key(33));
|
|
|
|
key_strs.push_back(Key(54));
|
|
|
|
key_strs.push_back(Key(102));
|
|
|
|
keys.push_back(key_strs[0]);
|
|
|
|
keys.push_back(key_strs[1]);
|
|
|
|
keys.push_back(key_strs[2]);
|
|
|
|
values.resize(keys.size());
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
int count = 0;
|
2022-12-05 06:58:25 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"TableCache::GetTableReader:BeforeOpenFile", [&](void* status) {
|
|
|
|
count++;
|
|
|
|
// Fail the last table reader open, which is the 6th SST file
|
|
|
|
// since 3 overlapping L0 files + 3 L1 files containing the keys
|
|
|
|
if (count == 6) {
|
|
|
|
Status* s = static_cast<Status*>(status);
|
|
|
|
*s = Status::IOError();
|
|
|
|
}
|
|
|
|
});
|
|
|
|
// DB open will create table readers unless we reduce the table cache
|
|
|
|
// capacity.
|
|
|
|
// SanitizeOptions will set max_open_files to minimum of 20. Table cache
|
|
|
|
// is allocated with max_open_files - 10 as capacity. So override
|
|
|
|
// max_open_files to 11 so table cache capacity will become 1. This will
|
|
|
|
// prevent file open during DB open and force the file to be opened
|
|
|
|
// during MultiGet
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
|
|
|
|
int* max_open_files = (int*)arg;
|
|
|
|
*max_open_files = 11;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
PrepareDBForTest();
|
2022-12-05 06:58:25 +00:00
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
ASSERT_EQ(values.size(), 3);
|
|
|
|
ASSERT_EQ(statuses[0], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[1], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[2], Status::IOError());
|
|
|
|
|
|
|
|
HistogramData multiget_io_batch_size;
|
|
|
|
|
|
|
|
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
|
|
|
|
|
|
|
|
// A batch of 3 async IOs is expected, one for each overlapping file in L1
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.count, 1);
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.max, 2);
|
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
|
|
|
|
}
|
2023-02-03 00:35:27 +00:00
|
|
|
#endif // ROCKSDB_IOURING_PRESENT
|
2022-12-05 06:58:25 +00:00
|
|
|
|
2022-08-19 23:52:52 +00:00
|
|
|
TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
|
|
|
|
// 21 is the last key in the first L1 file
|
|
|
|
key_strs.push_back(Key(21));
|
|
|
|
key_strs.push_back(Key(54));
|
|
|
|
key_strs.push_back(Key(102));
|
|
|
|
keys.push_back(key_strs[0]);
|
|
|
|
keys.push_back(key_strs[1]);
|
|
|
|
keys.push_back(key_strs[2]);
|
|
|
|
values.resize(keys.size());
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
PrepareDBForTest();
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
2022-08-19 23:52:52 +00:00
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
ASSERT_EQ(values.size(), 3);
|
|
|
|
ASSERT_EQ(statuses[0], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[1], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[2], Status::OK());
|
|
|
|
ASSERT_EQ(values[0], "val_l1_" + std::to_string(21));
|
|
|
|
ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
|
|
|
|
ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
#ifdef ROCKSDB_IOURING_PRESENT
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
HistogramData multiget_io_batch_size;
|
|
|
|
|
|
|
|
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
|
|
|
|
|
|
|
|
// Since the first MultiGet key is the last key in a file, the MultiGet is
|
|
|
|
// expected to lookup in that file first, before moving on to other files.
|
|
|
|
// So the first file lookup will issue one async read, and the next lookup
|
|
|
|
// will lookup 2 files in parallel and issue 2 async reads
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.count, 2);
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.max, 2);
|
2023-02-03 00:35:27 +00:00
|
|
|
#endif // ROCKSDB_IOURING_PRESENT
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
}
|
|
|
|
|
2022-08-19 23:52:52 +00:00
|
|
|
TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
|
|
|
|
// 33 and 102 are in L1, and 56 is in L2
|
|
|
|
key_strs.push_back(Key(33));
|
|
|
|
key_strs.push_back(Key(56));
|
|
|
|
key_strs.push_back(Key(102));
|
|
|
|
keys.push_back(key_strs[0]);
|
|
|
|
keys.push_back(key_strs[1]);
|
|
|
|
keys.push_back(key_strs[2]);
|
|
|
|
values.resize(keys.size());
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
PrepareDBForTest();
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
2022-08-19 23:52:52 +00:00
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
ASSERT_EQ(values.size(), 3);
|
|
|
|
ASSERT_EQ(statuses[0], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[1], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[2], Status::OK());
|
|
|
|
ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
|
|
|
|
ASSERT_EQ(values[1], "val_l2_" + std::to_string(56));
|
|
|
|
ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
#ifdef ROCKSDB_IOURING_PRESENT
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
HistogramData multiget_io_batch_size;
|
|
|
|
|
|
|
|
statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
|
|
|
|
|
2022-08-19 23:52:52 +00:00
|
|
|
// There are 2 keys in L1 in twp separate files, and 1 in L2. With
|
2022-09-01 04:03:52 +00:00
|
|
|
// optimize_multiget_for_io, all three lookups will happen in parallel.
|
|
|
|
// Otherwise, the L2 lookup will happen after L1.
|
|
|
|
ASSERT_EQ(multiget_io_batch_size.count, GetParam() ? 1 : 2);
|
2022-08-19 23:52:52 +00:00
|
|
|
ASSERT_EQ(multiget_io_batch_size.max, GetParam() ? 3 : 2);
|
2023-02-03 00:35:27 +00:00
|
|
|
#endif // ROCKSDB_IOURING_PRESENT
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
}
|
2022-08-04 19:51:57 +00:00
|
|
|
|
2022-08-19 23:52:52 +00:00
|
|
|
TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
|
2022-08-04 19:51:57 +00:00
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
|
|
|
|
// 19 and 26 are in L2, but overlap with L0 and L1 file ranges
|
|
|
|
key_strs.push_back(Key(19));
|
|
|
|
key_strs.push_back(Key(26));
|
|
|
|
keys.push_back(key_strs[0]);
|
|
|
|
keys.push_back(key_strs[1]);
|
|
|
|
values.resize(keys.size());
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
PrepareDBForTest();
|
|
|
|
|
2022-08-04 19:51:57 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
2022-08-19 23:52:52 +00:00
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
2022-08-04 19:51:57 +00:00
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
ASSERT_EQ(values.size(), 2);
|
|
|
|
ASSERT_EQ(statuses[0], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[1], Status::OK());
|
|
|
|
ASSERT_EQ(values[0], "val_l2_" + std::to_string(19));
|
|
|
|
ASSERT_EQ(values[1], "val_l2_" + std::to_string(26));
|
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
#ifdef ROCKSDB_IOURING_PRESENT
|
2022-08-04 19:51:57 +00:00
|
|
|
// Bloom filters in L0/L1 will avoid the coroutine calls in those levels
|
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
|
2023-03-17 21:57:09 +00:00
|
|
|
#else // ROCKSDB_IOURING_PRESENT
|
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
|
|
|
|
#endif // ROCKSDB_IOURING_PRESENT
|
2022-08-04 19:51:57 +00:00
|
|
|
}
|
2022-08-09 21:44:47 +00:00
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
#ifdef ROCKSDB_IOURING_PRESENT
|
2022-08-19 23:52:52 +00:00
|
|
|
TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
|
2022-08-09 21:44:47 +00:00
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
|
|
|
|
// 139 and 163 are in L2, but overlap with a range deletes in L1
|
|
|
|
key_strs.push_back(Key(139));
|
|
|
|
key_strs.push_back(Key(163));
|
|
|
|
keys.push_back(key_strs[0]);
|
|
|
|
keys.push_back(key_strs[1]);
|
|
|
|
values.resize(keys.size());
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
PrepareDBForTest();
|
|
|
|
|
2022-08-09 21:44:47 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
2022-08-19 23:52:52 +00:00
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
2022-08-09 21:44:47 +00:00
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
ASSERT_EQ(values.size(), 2);
|
|
|
|
ASSERT_EQ(statuses[0], Status::NotFound());
|
|
|
|
ASSERT_EQ(statuses[1], Status::NotFound());
|
|
|
|
|
|
|
|
// Bloom filters in L0/L1 will avoid the coroutine calls in those levels
|
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
|
|
|
|
}
|
2022-08-17 20:51:39 +00:00
|
|
|
|
2022-08-19 23:52:52 +00:00
|
|
|
TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) {
|
2022-08-17 20:51:39 +00:00
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
|
|
|
|
// 139 and 163 are in L2, but overlap with a range deletes in L1
|
|
|
|
key_strs.push_back(Key(139));
|
|
|
|
key_strs.push_back(Key(144));
|
|
|
|
key_strs.push_back(Key(163));
|
|
|
|
keys.push_back(key_strs[0]);
|
|
|
|
keys.push_back(key_strs[1]);
|
|
|
|
keys.push_back(key_strs[2]);
|
|
|
|
values.resize(keys.size());
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
2023-02-03 00:35:27 +00:00
|
|
|
PrepareDBForTest();
|
|
|
|
|
2022-08-17 20:51:39 +00:00
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
2022-08-19 23:52:52 +00:00
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
2022-08-17 20:51:39 +00:00
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
ASSERT_EQ(statuses[0], Status::NotFound());
|
|
|
|
ASSERT_EQ(statuses[1], Status::OK());
|
|
|
|
ASSERT_EQ(values[1], "val_l1_" + std::to_string(144));
|
|
|
|
ASSERT_EQ(statuses[2], Status::NotFound());
|
|
|
|
|
|
|
|
// Bloom filters in L0/L1 will avoid the coroutine calls in those levels
|
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
|
|
|
|
}
|
2023-02-03 00:35:27 +00:00
|
|
|
#endif // ROCKSDB_IOURING_PRESENT
|
|
|
|
|
|
|
|
TEST_P(DBMultiGetAsyncIOTest, GetNoIOUring) {
|
|
|
|
std::vector<std::string> key_strs;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
std::vector<PinnableSlice> values;
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
|
|
|
|
key_strs.push_back(Key(33));
|
|
|
|
key_strs.push_back(Key(54));
|
|
|
|
key_strs.push_back(Key(102));
|
|
|
|
keys.push_back(key_strs[0]);
|
|
|
|
keys.push_back(key_strs[1]);
|
|
|
|
keys.push_back(key_strs[2]);
|
|
|
|
values.resize(keys.size());
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
|
|
|
enable_io_uring = false;
|
|
|
|
ReopenDB();
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.async_io = true;
|
|
|
|
ro.optimize_multiget_for_io = GetParam();
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data());
|
|
|
|
ASSERT_EQ(values.size(), 3);
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(statuses[0], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[1], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[2], Status::OK());
|
2023-02-03 00:35:27 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
HistogramData async_read_bytes;
|
2023-02-03 00:35:27 +00:00
|
|
|
|
2023-03-17 21:57:09 +00:00
|
|
|
statistics()->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
|
2023-02-03 00:35:27 +00:00
|
|
|
|
|
|
|
// A batch of 3 async IOs is expected, one for each overlapping file in L1
|
2023-03-17 21:57:09 +00:00
|
|
|
ASSERT_EQ(async_read_bytes.count, 0);
|
|
|
|
ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0);
|
2023-02-03 00:35:27 +00:00
|
|
|
}
|
2022-08-19 23:52:52 +00:00
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest,
|
|
|
|
testing::Bool());
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
#endif // USE_COROUTINES
|
|
|
|
|
2020-10-07 20:27:03 +00:00
|
|
|
TEST_F(DBBasicTest, MultiGetStats) {
|
|
|
|
Options options;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.disable_auto_compactions = true;
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
options.env = env_;
|
2020-10-07 20:27:03 +00:00
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_size = 1;
|
|
|
|
table_options.index_type =
|
|
|
|
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
|
|
|
|
table_options.partition_filters = true;
|
|
|
|
table_options.no_block_cache = true;
|
|
|
|
table_options.cache_index_and_filter_blocks = false;
|
|
|
|
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2020-10-07 20:27:03 +00:00
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
|
|
|
|
int total_keys = 2000;
|
|
|
|
std::vector<std::string> keys_str(total_keys);
|
|
|
|
std::vector<Slice> keys(total_keys);
|
2022-02-18 00:30:40 +00:00
|
|
|
static size_t kMultiGetBatchSize = 100;
|
|
|
|
std::vector<PinnableSlice> values(kMultiGetBatchSize);
|
|
|
|
std::vector<Status> s(kMultiGetBatchSize);
|
2020-10-07 20:27:03 +00:00
|
|
|
ReadOptions read_opts;
|
|
|
|
|
|
|
|
Random rnd(309);
|
|
|
|
// Create Multiple SST files at multiple levels.
|
|
|
|
for (int i = 0; i < 500; ++i) {
|
|
|
|
keys_str[i] = "k" + std::to_string(i);
|
|
|
|
keys[i] = Slice(keys_str[i]);
|
|
|
|
ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
|
|
|
|
if (i % 100 == 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-10-07 20:27:03 +00:00
|
|
|
}
|
|
|
|
}
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-10-07 20:27:03 +00:00
|
|
|
MoveFilesToLevel(2, 1);
|
|
|
|
|
|
|
|
for (int i = 501; i < 1000; ++i) {
|
|
|
|
keys_str[i] = "k" + std::to_string(i);
|
|
|
|
keys[i] = Slice(keys_str[i]);
|
|
|
|
ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
|
|
|
|
if (i % 100 == 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-10-07 20:27:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-10-07 20:27:03 +00:00
|
|
|
MoveFilesToLevel(2, 1);
|
|
|
|
|
|
|
|
for (int i = 1001; i < total_keys; ++i) {
|
|
|
|
keys_str[i] = "k" + std::to_string(i);
|
|
|
|
keys[i] = Slice(keys_str[i]);
|
|
|
|
ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
|
|
|
|
if (i % 100 == 0) {
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2020-10-07 20:27:03 +00:00
|
|
|
}
|
|
|
|
}
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2022-02-18 00:30:40 +00:00
|
|
|
MoveFilesToLevel(1, 1);
|
2020-10-07 20:27:03 +00:00
|
|
|
Close();
|
|
|
|
|
|
|
|
ReopenWithColumnFamilies({"default", "pikachu"}, options);
|
|
|
|
ASSERT_OK(options.statistics->Reset());
|
|
|
|
|
2022-02-18 00:30:40 +00:00
|
|
|
db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250],
|
|
|
|
values.data(), s.data(), false);
|
2020-10-07 20:27:03 +00:00
|
|
|
|
2022-02-18 00:30:40 +00:00
|
|
|
ASSERT_EQ(values.size(), kMultiGetBatchSize);
|
2022-06-16 19:12:43 +00:00
|
|
|
HistogramData hist_level;
|
2020-10-07 20:27:03 +00:00
|
|
|
HistogramData hist_index_and_filter_blocks;
|
|
|
|
HistogramData hist_sst;
|
|
|
|
|
2022-06-16 19:12:43 +00:00
|
|
|
options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
|
2020-10-07 20:27:03 +00:00
|
|
|
options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
|
|
|
|
&hist_index_and_filter_blocks);
|
|
|
|
options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst);
|
|
|
|
|
|
|
|
// Maximum number of blocks read from a file system in a level.
|
2022-06-16 19:12:43 +00:00
|
|
|
ASSERT_EQ(hist_level.max, 1);
|
2020-10-07 20:27:03 +00:00
|
|
|
ASSERT_GT(hist_index_and_filter_blocks.max, 0);
|
|
|
|
// Maximum number of sst files read from file system in a level.
|
2022-02-18 00:30:40 +00:00
|
|
|
ASSERT_EQ(hist_sst.max, 2);
|
2020-10-07 20:27:03 +00:00
|
|
|
|
|
|
|
// Minimun number of blocks read in a level.
|
2022-06-16 19:12:43 +00:00
|
|
|
ASSERT_EQ(hist_level.min, 1);
|
2020-10-07 20:27:03 +00:00
|
|
|
ASSERT_GT(hist_index_and_filter_blocks.min, 0);
|
|
|
|
// Minimun number of sst files read in a level.
|
2022-02-18 00:30:40 +00:00
|
|
|
ASSERT_EQ(hist_sst.min, 1);
|
2022-06-16 19:12:43 +00:00
|
|
|
|
|
|
|
for (PinnableSlice& value : values) {
|
|
|
|
value.Reset();
|
|
|
|
}
|
|
|
|
for (Status& status : s) {
|
|
|
|
status = Status::OK();
|
|
|
|
}
|
|
|
|
db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[950],
|
|
|
|
values.data(), s.data(), false);
|
|
|
|
options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
|
|
|
|
ASSERT_EQ(hist_level.max, 2);
|
2020-10-07 20:27:03 +00:00
|
|
|
}
|
|
|
|
|
2019-08-21 17:21:41 +00:00
|
|
|
// Test class for batched MultiGet with prefix extractor
|
|
|
|
// Param bool - If true, use partitioned filters
|
|
|
|
// If false, use full filter block
|
2019-09-20 19:00:55 +00:00
|
|
|
class MultiGetPrefixExtractorTest : public DBBasicTest,
|
|
|
|
public ::testing::WithParamInterface<bool> {
|
|
|
|
};
|
2019-08-21 17:21:41 +00:00
|
|
|
|
|
|
|
TEST_P(MultiGetPrefixExtractorTest, Batched) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.prefix_extractor.reset(NewFixedPrefixTransform(2));
|
2019-10-10 16:37:38 +00:00
|
|
|
options.memtable_prefix_bloom_size_ratio = 10;
|
2019-08-21 17:21:41 +00:00
|
|
|
BlockBasedTableOptions bbto;
|
|
|
|
if (GetParam()) {
|
|
|
|
bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
|
|
|
|
bbto.partition_filters = true;
|
|
|
|
}
|
|
|
|
bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
|
|
|
|
bbto.whole_key_filtering = false;
|
|
|
|
bbto.cache_index_and_filter_blocks = false;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(bbto));
|
|
|
|
Reopen(options);
|
|
|
|
|
2019-10-10 16:37:38 +00:00
|
|
|
SetPerfLevel(kEnableCount);
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
2019-08-21 17:21:41 +00:00
|
|
|
ASSERT_OK(Put("k", "v0"));
|
|
|
|
ASSERT_OK(Put("kk1", "v1"));
|
|
|
|
ASSERT_OK(Put("kk2", "v2"));
|
|
|
|
ASSERT_OK(Put("kk3", "v3"));
|
|
|
|
ASSERT_OK(Put("kk4", "v4"));
|
2022-06-23 18:00:27 +00:00
|
|
|
std::vector<std::string> keys(
|
2019-10-10 16:37:38 +00:00
|
|
|
{"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"});
|
2022-06-23 18:00:27 +00:00
|
|
|
std::vector<std::string> expected(
|
|
|
|
{"v0", "v1", "v2", "v3", "v4", "NOT_FOUND", "NOT_FOUND"});
|
|
|
|
std::vector<std::string> values;
|
|
|
|
values = MultiGet(keys, nullptr);
|
|
|
|
ASSERT_EQ(values, expected);
|
|
|
|
// One key ("k") is not queried against the filter because it is outside
|
|
|
|
// the prefix_extractor domain, leaving 6 keys with queried prefixes.
|
2019-10-10 16:37:38 +00:00
|
|
|
ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2);
|
2022-06-23 18:00:27 +00:00
|
|
|
ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 4);
|
2019-08-21 17:21:41 +00:00
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
values = MultiGet(keys, nullptr);
|
2022-06-23 18:00:27 +00:00
|
|
|
ASSERT_EQ(values, expected);
|
|
|
|
ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
|
|
|
|
ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
|
|
|
|
|
|
|
|
// Also check Get stat
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
for (size_t i = 0; i < keys.size(); ++i) {
|
|
|
|
values[i] = Get(keys[i]);
|
|
|
|
}
|
|
|
|
ASSERT_EQ(values, expected);
|
|
|
|
ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
|
2019-08-21 17:21:41 +00:00
|
|
|
ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
|
|
|
|
}
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest,
|
|
|
|
::testing::Bool());
|
2019-08-21 17:21:41 +00:00
|
|
|
|
2019-09-20 19:00:55 +00:00
|
|
|
class DBMultiGetRowCacheTest : public DBBasicTest,
|
|
|
|
public ::testing::WithParamInterface<bool> {};
|
2019-08-28 23:10:38 +00:00
|
|
|
|
|
|
|
TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) {
|
|
|
|
do {
|
|
|
|
option_config_ = kRowCache;
|
|
|
|
Options options = CurrentOptions();
|
2020-02-20 20:07:53 +00:00
|
|
|
options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
|
2019-08-28 23:10:38 +00:00
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
SetPerfLevel(kEnableCount);
|
|
|
|
ASSERT_OK(Put(1, "k1", "v1"));
|
|
|
|
ASSERT_OK(Put(1, "k2", "v2"));
|
|
|
|
ASSERT_OK(Put(1, "k3", "v3"));
|
|
|
|
ASSERT_OK(Put(1, "k4", "v4"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2019-08-28 23:10:38 +00:00
|
|
|
ASSERT_OK(Put(1, "k5", "v5"));
|
|
|
|
const Snapshot* snap1 = dbfull()->GetSnapshot();
|
|
|
|
ASSERT_OK(Delete(1, "k4"));
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush(1));
|
2019-08-28 23:10:38 +00:00
|
|
|
const Snapshot* snap2 = dbfull()->GetSnapshot();
|
|
|
|
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
|
|
|
std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k1"});
|
|
|
|
std::vector<PinnableSlice> values(keys.size());
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
|
|
|
|
std::vector<Status> s(keys.size());
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
bool use_snapshots = GetParam();
|
|
|
|
if (use_snapshots) {
|
|
|
|
ro.snapshot = snap2;
|
|
|
|
}
|
2019-09-03 15:50:47 +00:00
|
|
|
db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
|
|
|
|
s.data(), false);
|
2019-08-28 23:10:38 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(values.size(), keys.size());
|
|
|
|
ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1");
|
|
|
|
ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
|
|
|
|
ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
|
|
|
|
// four kv pairs * two bytes per value
|
|
|
|
ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
|
|
|
|
|
|
|
|
ASSERT_TRUE(s[0].IsNotFound());
|
|
|
|
ASSERT_OK(s[1]);
|
|
|
|
ASSERT_TRUE(s[2].IsNotFound());
|
|
|
|
ASSERT_OK(s[3]);
|
|
|
|
ASSERT_OK(s[4]);
|
|
|
|
|
|
|
|
// Call MultiGet() again with some intersection with the previous set of
|
|
|
|
// keys. Those should already be in the row cache.
|
|
|
|
keys.assign({"no_key", "k5", "k3", "k2"});
|
|
|
|
for (size_t i = 0; i < keys.size(); ++i) {
|
|
|
|
values[i].Reset();
|
|
|
|
s[i] = Status::OK();
|
|
|
|
}
|
|
|
|
get_perf_context()->Reset();
|
|
|
|
|
|
|
|
if (use_snapshots) {
|
|
|
|
ro.snapshot = snap1;
|
|
|
|
}
|
|
|
|
db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
|
|
|
|
values.data(), s.data(), false);
|
|
|
|
|
|
|
|
ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2");
|
|
|
|
ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
|
|
|
|
ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
|
|
|
|
// four kv pairs * two bytes per value
|
|
|
|
ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
|
|
|
|
|
|
|
|
ASSERT_TRUE(s[0].IsNotFound());
|
|
|
|
ASSERT_OK(s[1]);
|
|
|
|
ASSERT_OK(s[2]);
|
|
|
|
ASSERT_OK(s[3]);
|
|
|
|
if (use_snapshots) {
|
|
|
|
// Only reads from the first SST file would have been cached, since
|
|
|
|
// snapshot seq no is > fd.largest_seqno
|
|
|
|
ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT));
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT));
|
|
|
|
}
|
|
|
|
|
|
|
|
SetPerfLevel(kDisable);
|
|
|
|
dbfull()->ReleaseSnapshot(snap1);
|
|
|
|
dbfull()->ReleaseSnapshot(snap2);
|
|
|
|
} while (ChangeCompactOptions());
|
|
|
|
}
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest,
|
|
|
|
testing::Values(true, false));
|
2019-08-28 23:10:38 +00:00
|
|
|
|
2019-07-08 05:40:52 +00:00
|
|
|
TEST_F(DBBasicTest, GetAllKeyVersions) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
ASSERT_EQ(2, handles_.size());
|
|
|
|
const size_t kNumInserts = 4;
|
|
|
|
const size_t kNumDeletes = 4;
|
|
|
|
const size_t kNumUpdates = 4;
|
|
|
|
|
|
|
|
// Check default column family
|
|
|
|
for (size_t i = 0; i != kNumInserts; ++i) {
|
|
|
|
ASSERT_OK(Put(std::to_string(i), "value"));
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i != kNumUpdates; ++i) {
|
|
|
|
ASSERT_OK(Put(std::to_string(i), "value1"));
|
|
|
|
}
|
|
|
|
for (size_t i = 0; i != kNumDeletes; ++i) {
|
|
|
|
ASSERT_OK(Delete(std::to_string(i)));
|
|
|
|
}
|
|
|
|
std::vector<KeyVersion> key_versions;
|
2022-07-15 21:42:00 +00:00
|
|
|
ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
|
|
|
|
std::numeric_limits<size_t>::max(),
|
|
|
|
&key_versions));
|
2019-07-08 05:40:52 +00:00
|
|
|
ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
|
2022-07-15 21:42:00 +00:00
|
|
|
for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
|
|
|
|
if (i % 3 == 0) {
|
|
|
|
ASSERT_EQ(key_versions[i].GetTypeName(), "TypeDeletion");
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(),
|
|
|
|
std::numeric_limits<size_t>::max(),
|
|
|
|
&key_versions));
|
2019-07-08 05:40:52 +00:00
|
|
|
ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
|
|
|
|
|
|
|
|
// Check non-default column family
|
2020-06-02 22:02:44 +00:00
|
|
|
for (size_t i = 0; i + 1 != kNumInserts; ++i) {
|
2019-07-08 05:40:52 +00:00
|
|
|
ASSERT_OK(Put(1, std::to_string(i), "value"));
|
|
|
|
}
|
2020-06-02 22:02:44 +00:00
|
|
|
for (size_t i = 0; i + 1 != kNumUpdates; ++i) {
|
2019-07-08 05:40:52 +00:00
|
|
|
ASSERT_OK(Put(1, std::to_string(i), "value1"));
|
|
|
|
}
|
2020-06-02 22:02:44 +00:00
|
|
|
for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
|
2019-07-08 05:40:52 +00:00
|
|
|
ASSERT_OK(Delete(1, std::to_string(i)));
|
|
|
|
}
|
2022-07-15 21:42:00 +00:00
|
|
|
ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
|
|
|
|
std::numeric_limits<size_t>::max(),
|
|
|
|
&key_versions));
|
2019-07-08 05:40:52 +00:00
|
|
|
ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
|
|
|
|
}
|
2022-07-15 21:42:00 +00:00
|
|
|
|
|
|
|
TEST_F(DBBasicTest, ValueTypeString) {
|
|
|
|
KeyVersion key_version;
|
|
|
|
// when adding new type, please also update `value_type_string_map`
|
|
|
|
for (unsigned char i = ValueType::kTypeDeletion; i < ValueType::kTypeMaxValid;
|
|
|
|
i++) {
|
|
|
|
key_version.type = i;
|
|
|
|
ASSERT_TRUE(key_version.GetTypeName() != "Invalid");
|
|
|
|
}
|
|
|
|
}
|
2019-07-08 05:40:52 +00:00
|
|
|
|
2019-11-12 00:57:49 +00:00
|
|
|
TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
Random rnd(301);
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.pin_l0_filter_and_index_blocks_in_cache = true;
|
|
|
|
table_options.block_size = 16 * 1024;
|
2020-03-24 18:21:10 +00:00
|
|
|
ASSERT_TRUE(table_options.block_size >
|
2022-11-02 21:34:24 +00:00
|
|
|
BlockBasedTable::kMultiGetReadStackBufSize);
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2019-11-12 00:57:49 +00:00
|
|
|
Reopen(options);
|
|
|
|
|
|
|
|
std::string zero_str(128, '\0');
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
|
|
|
// Make the value compressible. A purely random string doesn't compress
|
|
|
|
// and the resultant data block will not be compressed
|
2020-07-09 21:33:42 +00:00
|
|
|
std::string value(rnd.RandomString(128) + zero_str);
|
2019-11-12 00:57:49 +00:00
|
|
|
assert(Put(Key(i), value) == Status::OK());
|
|
|
|
}
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2019-11-12 00:57:49 +00:00
|
|
|
|
|
|
|
std::vector<std::string> key_data(10);
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
// We cannot resize a PinnableSlice vector, so just set initial size to
|
|
|
|
// largest we think we will need
|
|
|
|
std::vector<PinnableSlice> values(10);
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
ReadOptions ro;
|
|
|
|
|
|
|
|
// Warm up the cache first
|
|
|
|
key_data.emplace_back(Key(0));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
key_data.emplace_back(Key(50));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
|
|
|
}
|
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu", "eevee"}, options);
|
|
|
|
size_t num_cfs = handles_.size();
|
|
|
|
ASSERT_EQ(3, num_cfs);
|
|
|
|
WriteOptions write_opts;
|
|
|
|
write_opts.disableWAL = true;
|
|
|
|
for (size_t cf = 0; cf != num_cfs; ++cf) {
|
|
|
|
for (size_t i = 0; i != 10000; ++i) {
|
|
|
|
std::string key_str = Key(static_cast<int>(i));
|
|
|
|
std::string value_str = std::to_string(cf) + "_" + std::to_string(i);
|
|
|
|
|
|
|
|
ASSERT_OK(Put(static_cast<int>(cf), key_str, value_str));
|
|
|
|
if (0 == (i % 1000)) {
|
|
|
|
ASSERT_OK(Flush(static_cast<int>(cf)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (size_t cf = 0; cf != num_cfs; ++cf) {
|
|
|
|
ASSERT_OK(Flush(static_cast<int>(cf)));
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
options.best_efforts_recovery = true;
|
|
|
|
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
|
|
|
|
options);
|
|
|
|
num_cfs = handles_.size();
|
|
|
|
ASSERT_EQ(3, num_cfs);
|
|
|
|
for (size_t cf = 0; cf != num_cfs; ++cf) {
|
|
|
|
for (int i = 0; i != 10000; ++i) {
|
|
|
|
std::string key_str = Key(static_cast<int>(i));
|
|
|
|
std::string expected_value_str =
|
|
|
|
std::to_string(cf) + "_" + std::to_string(i);
|
|
|
|
ASSERT_EQ(expected_value_str, Get(static_cast<int>(cf), key_str));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-05 17:44:12 +00:00
|
|
|
TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("foo", "value"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
|
|
|
|
ASSERT_NE(nullptr, arg);
|
|
|
|
*(reinterpret_cast<Status*>(arg)) =
|
|
|
|
Status::Corruption("Inject corruption");
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
options.best_efforts_recovery = true;
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
ASSERT_TRUE(s.IsCorruption());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
namespace {
|
|
|
|
class TableFileListener : public EventListener {
|
|
|
|
public:
|
|
|
|
void OnTableFileCreated(const TableFileCreationInfo& info) override {
|
|
|
|
InstrumentedMutexLock lock(&mutex_);
|
|
|
|
cf_to_paths_[info.cf_name].push_back(info.file_path);
|
|
|
|
}
|
|
|
|
std::vector<std::string>& GetFiles(const std::string& cf_name) {
|
|
|
|
InstrumentedMutexLock lock(&mutex_);
|
|
|
|
return cf_to_paths_[cf_name];
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
InstrumentedMutex mutex_;
|
|
|
|
std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
|
|
|
|
};
|
2022-11-02 21:34:24 +00:00
|
|
|
} // anonymous namespace
|
2020-03-21 02:17:54 +00:00
|
|
|
|
Fix a recovery corner case (#7621)
Summary:
Consider the following sequence of events:
1. Db flushed an SST with file number N, appended to MANIFEST, and tried to sync the MANIFEST.
2. Syncing MANIFEST failed and db crashed.
3. Db tried to recover with this MANIFEST. In the meantime, no entry about the newly-flushed SST was found in the MANIFEST. Therefore, RocksDB replayed WAL and tried to flush to an SST file reusing the same file number N. This failed because file system does not support overwrite. Then Db deleted this file.
4. Db crashed again.
5. Db tried to recover. When db read the MANIFEST, there was an entry referencing N.sst. This could happen probably because the append in step 1 finally reached the MANIFEST and became visible. Since N.sst had been deleted in step 3, recovery failed.
It is possible that N.sst created in step 1 is valid. Although step 3 would still fail since the MANIFEST was not synced properly in step 1 and 2, deleting N.sst would make it impossible for the db to recover even if the remaining part of MANIFEST was appended and visible after step 5.
After this PR, in step 3, immediately after recovering from MANIFEST, a new MANIFEST is created, then we find that N.sst is not referenced in the MANIFEST, so we delete it, and we'll not reuse N as file number. Then in step 5, since the new MANIFEST does not contain N.sst, the recovery failure situation in step 5 won't happen.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7621
Test Plan:
1. some tests are updated, because these tests assume that new MANIFEST is created after WAL recovery.
2. a new unit test is added in db_basic_test to simulate step 3.
Reviewed By: riversand963
Differential Revision: D24668144
Pulled By: cheng-chang
fbshipit-source-id: 90d7487fbad2bc3714f5ede46ea949895b15ae3b
2020-11-08 05:54:55 +00:00
|
|
|
TEST_F(DBBasicTest, LastSstFileNotInManifest) {
|
|
|
|
// If the last sst file is not tracked in MANIFEST,
|
|
|
|
// or the VersionEdit for the last sst file is not synced,
|
|
|
|
// on recovery, the last sst file should be deleted,
|
|
|
|
// and new sst files shouldn't reuse its file number.
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
Close();
|
|
|
|
|
|
|
|
// Manually add a sst file.
|
|
|
|
constexpr uint64_t kSstFileNumber = 100;
|
|
|
|
const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber);
|
|
|
|
ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content",
|
|
|
|
/* fname = */ kSstFile,
|
|
|
|
/* should_sync = */ true));
|
|
|
|
ASSERT_OK(env_->FileExists(kSstFile));
|
|
|
|
|
|
|
|
TableFileListener* listener = new TableFileListener();
|
|
|
|
options.listeners.emplace_back(listener);
|
|
|
|
Reopen(options);
|
|
|
|
// kSstFile should already be deleted.
|
|
|
|
ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound());
|
|
|
|
|
|
|
|
ASSERT_OK(Put("k", "v"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
// New sst file should have file number > kSstFileNumber.
|
|
|
|
std::vector<std::string>& files =
|
|
|
|
listener->GetFiles(kDefaultColumnFamilyName);
|
|
|
|
ASSERT_EQ(files.size(), 1);
|
|
|
|
const std::string fname = files[0].erase(0, (dbname_ + "/").size());
|
|
|
|
uint64_t number = 0;
|
|
|
|
FileType type = kTableFile;
|
|
|
|
ASSERT_TRUE(ParseFileName(fname, &number, &type));
|
|
|
|
ASSERT_EQ(type, kTableFile);
|
|
|
|
ASSERT_GT(number, kSstFileNumber);
|
|
|
|
}
|
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
TEST_F(DBBasicTest, RecoverWithMissingFiles) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
TableFileListener* listener = new TableFileListener();
|
|
|
|
// Disable auto compaction to simplify SST file name tracking.
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.listeners.emplace_back(listener);
|
|
|
|
CreateAndReopenWithCF({"pikachu", "eevee"}, options);
|
|
|
|
std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
|
|
|
|
"eevee"};
|
|
|
|
size_t num_cfs = handles_.size();
|
|
|
|
ASSERT_EQ(3, num_cfs);
|
|
|
|
for (size_t cf = 0; cf != num_cfs; ++cf) {
|
|
|
|
ASSERT_OK(Put(static_cast<int>(cf), "a", "0_value"));
|
|
|
|
ASSERT_OK(Flush(static_cast<int>(cf)));
|
|
|
|
ASSERT_OK(Put(static_cast<int>(cf), "b", "0_value"));
|
|
|
|
ASSERT_OK(Flush(static_cast<int>(cf)));
|
|
|
|
ASSERT_OK(Put(static_cast<int>(cf), "c", "0_value"));
|
|
|
|
ASSERT_OK(Flush(static_cast<int>(cf)));
|
|
|
|
}
|
|
|
|
|
2020-05-08 19:59:02 +00:00
|
|
|
// Delete and corrupt files
|
2020-03-21 02:17:54 +00:00
|
|
|
for (size_t i = 0; i < all_cf_names.size(); ++i) {
|
|
|
|
std::vector<std::string>& files = listener->GetFiles(all_cf_names[i]);
|
|
|
|
ASSERT_EQ(3, files.size());
|
2020-05-08 19:59:02 +00:00
|
|
|
std::string corrupted_data;
|
|
|
|
ASSERT_OK(ReadFileToString(env_, files[files.size() - 1], &corrupted_data));
|
|
|
|
ASSERT_OK(WriteStringToFile(
|
|
|
|
env_, corrupted_data.substr(0, corrupted_data.size() - 2),
|
|
|
|
files[files.size() - 1], /*should_sync=*/true));
|
|
|
|
for (int j = static_cast<int>(files.size() - 2); j >= static_cast<int>(i);
|
2020-03-21 02:17:54 +00:00
|
|
|
--j) {
|
|
|
|
ASSERT_OK(env_->DeleteFile(files[j]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
options.best_efforts_recovery = true;
|
|
|
|
ReopenWithColumnFamilies(all_cf_names, options);
|
|
|
|
// Verify data
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
{
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2020-03-21 02:17:54 +00:00
|
|
|
iter.reset(db_->NewIterator(read_opts, handles_[1]));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("a", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2020-03-21 02:17:54 +00:00
|
|
|
iter.reset(db_->NewIterator(read_opts, handles_[2]));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("a", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("b", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2020-03-21 02:17:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-08 19:59:02 +00:00
|
|
|
TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("foo", "value0"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
Close();
|
|
|
|
{
|
|
|
|
// Hack by adding a new MANIFEST with high file number
|
|
|
|
std::string garbage(10, '\0');
|
|
|
|
ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/MANIFEST-001000",
|
|
|
|
/*should_sync=*/true));
|
|
|
|
}
|
|
|
|
{
|
|
|
|
// Hack by adding a corrupted SST not referenced by any MANIFEST
|
|
|
|
std::string garbage(10, '\0');
|
|
|
|
ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/001001.sst",
|
|
|
|
/*should_sync=*/true));
|
|
|
|
}
|
|
|
|
|
|
|
|
options.best_efforts_recovery = true;
|
|
|
|
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_OK(Put("bar", "value"));
|
|
|
|
}
|
|
|
|
|
2020-04-23 23:18:28 +00:00
|
|
|
TEST_F(DBBasicTest, RecoverWithNoCurrentFile) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
options.best_efforts_recovery = true;
|
|
|
|
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
|
|
|
|
ASSERT_EQ(2, handles_.size());
|
|
|
|
ASSERT_OK(Put("foo", "value"));
|
|
|
|
ASSERT_OK(Put(1, "bar", "value"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_OK(Flush(1));
|
|
|
|
Close();
|
|
|
|
ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_)));
|
|
|
|
ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
|
|
|
|
std::vector<std::string> cf_names;
|
|
|
|
ASSERT_OK(DB::ListColumnFamilies(DBOptions(options), dbname_, &cf_names));
|
|
|
|
ASSERT_EQ(2, cf_names.size());
|
|
|
|
for (const auto& name : cf_names) {
|
|
|
|
ASSERT_TRUE(name == kDefaultColumnFamilyName || name == "pikachu");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-15 21:09:53 +00:00
|
|
|
TEST_F(DBBasicTest, RecoverWithNoManifest) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.env = env_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("foo", "value"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
Close();
|
|
|
|
{
|
|
|
|
// Delete all MANIFEST.
|
|
|
|
std::vector<std::string> files;
|
|
|
|
ASSERT_OK(env_->GetChildren(dbname_, &files));
|
|
|
|
for (const auto& file : files) {
|
|
|
|
uint64_t number = 0;
|
2020-10-23 00:04:39 +00:00
|
|
|
FileType type = kWalFile;
|
2020-06-15 21:09:53 +00:00
|
|
|
if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
|
|
|
|
ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
options.best_efforts_recovery = true;
|
|
|
|
options.create_if_missing = false;
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
options.create_if_missing = true;
|
|
|
|
Reopen(options);
|
|
|
|
// Since no MANIFEST exists, best-efforts recovery creates a new, empty db.
|
|
|
|
ASSERT_EQ("NOT_FOUND", Get("foo"));
|
|
|
|
}
|
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
TableFileListener* listener = new TableFileListener();
|
|
|
|
options.listeners.emplace_back(listener);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
std::vector<std::string> kAllCfNames = {kDefaultColumnFamilyName, "pikachu"};
|
|
|
|
size_t num_cfs = handles_.size();
|
|
|
|
ASSERT_EQ(2, num_cfs);
|
|
|
|
for (int cf = 0; cf < static_cast<int>(kAllCfNames.size()); ++cf) {
|
|
|
|
ASSERT_OK(Put(cf, "a", "0_value"));
|
|
|
|
ASSERT_OK(Flush(cf));
|
|
|
|
ASSERT_OK(Put(cf, "b", "0_value"));
|
|
|
|
}
|
|
|
|
// Delete files
|
|
|
|
for (size_t i = 0; i < kAllCfNames.size(); ++i) {
|
|
|
|
std::vector<std::string>& files = listener->GetFiles(kAllCfNames[i]);
|
|
|
|
ASSERT_EQ(1, files.size());
|
|
|
|
for (int j = static_cast<int>(files.size() - 1); j >= static_cast<int>(i);
|
|
|
|
--j) {
|
|
|
|
ASSERT_OK(env_->DeleteFile(files[j]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
options.best_efforts_recovery = true;
|
|
|
|
ReopenWithColumnFamilies(kAllCfNames, options);
|
|
|
|
// Verify WAL is not applied
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2020-03-21 02:17:54 +00:00
|
|
|
iter.reset(db_->NewIterator(read_opts, handles_[1]));
|
|
|
|
iter->SeekToFirst();
|
|
|
|
ASSERT_TRUE(iter->Valid());
|
|
|
|
ASSERT_EQ("a", iter->key());
|
|
|
|
iter->Next();
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(iter->status());
|
2020-03-21 02:17:54 +00:00
|
|
|
}
|
2020-12-10 00:56:26 +00:00
|
|
|
|
|
|
|
TEST_F(DBBasicTest, DisableTrackWal) {
|
|
|
|
// If WAL tracking was enabled, and then disabled during reopen,
|
|
|
|
// the previously tracked WALs should be removed from MANIFEST.
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.track_and_verify_wals_in_manifest = true;
|
|
|
|
// extremely small write buffer size,
|
|
|
|
// so that new WALs are created more frequently.
|
|
|
|
options.write_buffer_size = 100;
|
|
|
|
options.env = env_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
for (int i = 0; i < 100; i++) {
|
|
|
|
ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(dbfull()->TEST_SwitchMemtable());
|
|
|
|
ASSERT_OK(db_->SyncWAL());
|
|
|
|
// Some WALs are tracked.
|
2021-12-10 19:03:39 +00:00
|
|
|
ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
|
2020-12-10 00:56:26 +00:00
|
|
|
Close();
|
|
|
|
|
|
|
|
// Disable WAL tracking.
|
|
|
|
options.track_and_verify_wals_in_manifest = false;
|
|
|
|
options.create_if_missing = false;
|
|
|
|
ASSERT_OK(TryReopen(options));
|
|
|
|
// Previously tracked WALs are cleared.
|
2021-12-10 19:03:39 +00:00
|
|
|
ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
|
2020-12-10 00:56:26 +00:00
|
|
|
Close();
|
|
|
|
|
|
|
|
// Re-enable WAL tracking again.
|
|
|
|
options.track_and_verify_wals_in_manifest = true;
|
|
|
|
options.create_if_missing = false;
|
|
|
|
ASSERT_OK(TryReopen(options));
|
2021-12-10 19:03:39 +00:00
|
|
|
ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
|
2020-12-10 00:56:26 +00:00
|
|
|
Close();
|
|
|
|
}
|
2020-03-21 02:17:54 +00:00
|
|
|
|
2020-06-18 17:07:42 +00:00
|
|
|
TEST_F(DBBasicTest, ManifestChecksumMismatch) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("bar", "value"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", [&](void* arg) {
|
|
|
|
auto* crc = reinterpret_cast<uint32_t*>(arg);
|
|
|
|
*crc = *crc + 1;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
WriteOptions write_opts;
|
|
|
|
write_opts.disableWAL = true;
|
|
|
|
Status s = db_->Put(write_opts, "foo", "value");
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
ASSERT_OK(Put("foo", "value1"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
s = TryReopen(options);
|
|
|
|
ASSERT_TRUE(s.IsCorruption());
|
|
|
|
}
|
|
|
|
|
2021-10-19 03:31:32 +00:00
|
|
|
TEST_F(DBBasicTest, ConcurrentlyCloseDB) {
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
std::vector<std::thread> workers;
|
|
|
|
for (int i = 0; i < 10; i++) {
|
|
|
|
workers.push_back(std::thread([&]() {
|
|
|
|
auto s = db_->Close();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}));
|
|
|
|
}
|
|
|
|
for (auto& w : workers) {
|
|
|
|
w.join();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Do not track obsolete WALs in MANIFEST even if they are synced (#7725)
Summary:
Consider the case:
1. All column families are flushed, so all WALs become obsolete, but no WAL is removed from disk yet because the removal is asynchronous, a VersionEdit is written to MANIFEST indicating that WALs before a certain WAL number are obsolete, let's say this number is 3;
2. `SyncWAL` is called, so all the on-disk WALs are synced, and if track_and_verify_wal_in_manifest=true, the WALs will be tracked in MANIFEST, let's say the WAL numbers are 1 and 2;
3. DB crashes;
4. During DB recovery, when replaying MANIFEST, we first see that WAL with number < 3 are obsolete, then we see that WAL 1 and 2 are synced, so according to current implementation of `WalSet`, the `WalSet` will be recovered to include WAL 1 and 2;
5. WAL 1 and 2 are asynchronously deleted from disk, then the WAL verification algorithm fails with `Corruption: missing WAL`.
The above case is reproduced in a new unit test `DBBasicTestTrackWal::DoNotTrackObsoleteWal`.
The fix is to maintain the upper bound of the obsolete WAL numbers, any WAL with number less than the maintained number is considered to be obsolete, so shouldn't be tracked even if they are later synced. The number is maintained in `WalSet`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7725
Test Plan:
1. a new unit test `DBBasicTestTrackWal::DoNotTrackObsoleteWal` is added.
2. run `make crash_test` on devserver.
Reviewed By: riversand963
Differential Revision: D25238914
Pulled By: cheng-chang
fbshipit-source-id: f5dccd57c3d89f19565ec5731f2d42f06d272b72
2020-12-08 18:56:50 +00:00
|
|
|
class DBBasicTestTrackWal : public DBTestBase,
|
|
|
|
public testing::WithParamInterface<bool> {
|
|
|
|
public:
|
|
|
|
DBBasicTestTrackWal()
|
2021-07-23 15:37:27 +00:00
|
|
|
: DBTestBase("db_basic_test_track_wal", /*env_do_fsync=*/false) {}
|
Do not track obsolete WALs in MANIFEST even if they are synced (#7725)
Summary:
Consider the case:
1. All column families are flushed, so all WALs become obsolete, but no WAL is removed from disk yet because the removal is asynchronous, a VersionEdit is written to MANIFEST indicating that WALs before a certain WAL number are obsolete, let's say this number is 3;
2. `SyncWAL` is called, so all the on-disk WALs are synced, and if track_and_verify_wal_in_manifest=true, the WALs will be tracked in MANIFEST, let's say the WAL numbers are 1 and 2;
3. DB crashes;
4. During DB recovery, when replaying MANIFEST, we first see that WAL with number < 3 are obsolete, then we see that WAL 1 and 2 are synced, so according to current implementation of `WalSet`, the `WalSet` will be recovered to include WAL 1 and 2;
5. WAL 1 and 2 are asynchronously deleted from disk, then the WAL verification algorithm fails with `Corruption: missing WAL`.
The above case is reproduced in a new unit test `DBBasicTestTrackWal::DoNotTrackObsoleteWal`.
The fix is to maintain the upper bound of the obsolete WAL numbers, any WAL with number less than the maintained number is considered to be obsolete, so shouldn't be tracked even if they are later synced. The number is maintained in `WalSet`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7725
Test Plan:
1. a new unit test `DBBasicTestTrackWal::DoNotTrackObsoleteWal` is added.
2. run `make crash_test` on devserver.
Reviewed By: riversand963
Differential Revision: D25238914
Pulled By: cheng-chang
fbshipit-source-id: f5dccd57c3d89f19565ec5731f2d42f06d272b72
2020-12-08 18:56:50 +00:00
|
|
|
|
|
|
|
int CountWalFiles() {
|
|
|
|
VectorLogPtr log_files;
|
2020-12-24 00:54:05 +00:00
|
|
|
EXPECT_OK(dbfull()->GetSortedWalFiles(log_files));
|
Do not track obsolete WALs in MANIFEST even if they are synced (#7725)
Summary:
Consider the case:
1. All column families are flushed, so all WALs become obsolete, but no WAL is removed from disk yet because the removal is asynchronous, a VersionEdit is written to MANIFEST indicating that WALs before a certain WAL number are obsolete, let's say this number is 3;
2. `SyncWAL` is called, so all the on-disk WALs are synced, and if track_and_verify_wal_in_manifest=true, the WALs will be tracked in MANIFEST, let's say the WAL numbers are 1 and 2;
3. DB crashes;
4. During DB recovery, when replaying MANIFEST, we first see that WAL with number < 3 are obsolete, then we see that WAL 1 and 2 are synced, so according to current implementation of `WalSet`, the `WalSet` will be recovered to include WAL 1 and 2;
5. WAL 1 and 2 are asynchronously deleted from disk, then the WAL verification algorithm fails with `Corruption: missing WAL`.
The above case is reproduced in a new unit test `DBBasicTestTrackWal::DoNotTrackObsoleteWal`.
The fix is to maintain the upper bound of the obsolete WAL numbers, any WAL with number less than the maintained number is considered to be obsolete, so shouldn't be tracked even if they are later synced. The number is maintained in `WalSet`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7725
Test Plan:
1. a new unit test `DBBasicTestTrackWal::DoNotTrackObsoleteWal` is added.
2. run `make crash_test` on devserver.
Reviewed By: riversand963
Differential Revision: D25238914
Pulled By: cheng-chang
fbshipit-source-id: f5dccd57c3d89f19565ec5731f2d42f06d272b72
2020-12-08 18:56:50 +00:00
|
|
|
return static_cast<int>(log_files.size());
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) {
|
|
|
|
// If a WAL becomes obsolete after flushing, but is not deleted from disk yet,
|
|
|
|
// then if SyncWAL is called afterwards, the obsolete WAL should not be
|
|
|
|
// tracked in MANIFEST.
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.track_and_verify_wals_in_manifest = true;
|
|
|
|
options.atomic_flush = GetParam();
|
|
|
|
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"cf"}, options);
|
|
|
|
ASSERT_EQ(handles_.size(), 2); // default, cf
|
|
|
|
// Do not delete WALs.
|
|
|
|
ASSERT_OK(db_->DisableFileDeletions());
|
|
|
|
constexpr int n = 10;
|
|
|
|
std::vector<std::unique_ptr<LogFile>> wals(n);
|
|
|
|
for (size_t i = 0; i < n; i++) {
|
|
|
|
// Generate a new WAL for each key-value.
|
|
|
|
const int cf = i % 2;
|
|
|
|
ASSERT_OK(db_->GetCurrentWalFile(&wals[i]));
|
|
|
|
ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i)));
|
|
|
|
ASSERT_OK(Flush({0, 1}));
|
|
|
|
}
|
|
|
|
ASSERT_EQ(CountWalFiles(), n);
|
|
|
|
// Since all WALs are obsolete, no WAL should be tracked in MANIFEST.
|
|
|
|
ASSERT_OK(db_->SyncWAL());
|
|
|
|
|
|
|
|
// Manually delete all WALs.
|
|
|
|
Close();
|
|
|
|
for (const auto& wal : wals) {
|
|
|
|
ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
|
|
|
|
}
|
|
|
|
|
|
|
|
// If SyncWAL tracks the obsolete WALs in MANIFEST,
|
|
|
|
// reopen will fail because the WALs are missing from disk.
|
|
|
|
ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options));
|
|
|
|
Destroy(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal,
|
|
|
|
testing::Bool());
|
|
|
|
|
2020-04-21 21:49:50 +00:00
|
|
|
class DBBasicTestMultiGet : public DBTestBase {
|
2019-07-01 03:52:34 +00:00
|
|
|
public:
|
2023-01-25 01:09:19 +00:00
|
|
|
DBBasicTestMultiGet(std::string test_dir, int num_cfs,
|
2020-06-03 18:37:12 +00:00
|
|
|
bool uncompressed_cache, bool _compression_enabled,
|
|
|
|
bool _fill_cache, uint32_t compression_parallel_threads)
|
2020-09-16 01:53:53 +00:00
|
|
|
: DBTestBase(test_dir, /*env_do_fsync=*/false) {
|
2020-06-03 18:37:12 +00:00
|
|
|
compression_enabled_ = _compression_enabled;
|
|
|
|
fill_cache_ = _fill_cache;
|
2019-07-01 03:52:34 +00:00
|
|
|
|
|
|
|
if (uncompressed_cache) {
|
|
|
|
std::shared_ptr<Cache> cache = NewLRUCache(1048576);
|
|
|
|
uncompressed_cache_ = std::make_shared<MyBlockCache>(cache);
|
|
|
|
}
|
|
|
|
|
|
|
|
env_->count_random_reads_ = true;
|
|
|
|
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
Random rnd(301);
|
|
|
|
BlockBasedTableOptions table_options;
|
2019-11-18 17:35:37 +00:00
|
|
|
|
|
|
|
if (compression_enabled_) {
|
|
|
|
std::vector<CompressionType> compression_types;
|
|
|
|
compression_types = GetSupportedCompressions();
|
|
|
|
// Not every platform may have compression libraries available, so
|
|
|
|
// dynamically pick based on what's available
|
2020-03-24 01:47:34 +00:00
|
|
|
CompressionType tmp_type = kNoCompression;
|
|
|
|
for (auto c_type : compression_types) {
|
|
|
|
if (c_type != kNoCompression) {
|
|
|
|
tmp_type = c_type;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (tmp_type != kNoCompression) {
|
|
|
|
options.compression = tmp_type;
|
2019-11-18 17:35:37 +00:00
|
|
|
} else {
|
2020-03-24 01:47:34 +00:00
|
|
|
compression_enabled_ = false;
|
2019-11-18 17:35:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-07-01 03:52:34 +00:00
|
|
|
table_options.block_cache = uncompressed_cache_;
|
2019-11-07 20:00:45 +00:00
|
|
|
if (table_options.block_cache == nullptr) {
|
|
|
|
table_options.no_block_cache = true;
|
|
|
|
} else {
|
|
|
|
table_options.pin_l0_filter_and_index_blocks_in_cache = true;
|
|
|
|
}
|
2019-07-01 03:52:34 +00:00
|
|
|
table_options.flush_block_policy_factory.reset(
|
2019-09-20 19:00:55 +00:00
|
|
|
new MyFlushBlockPolicyFactory());
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2019-07-01 03:52:34 +00:00
|
|
|
if (!compression_enabled_) {
|
|
|
|
options.compression = kNoCompression;
|
2020-04-01 23:37:54 +00:00
|
|
|
} else {
|
|
|
|
options.compression_opts.parallel_threads = compression_parallel_threads;
|
2019-07-01 03:52:34 +00:00
|
|
|
}
|
2020-12-10 00:57:54 +00:00
|
|
|
options_ = options;
|
2019-07-01 03:52:34 +00:00
|
|
|
Reopen(options);
|
|
|
|
|
2020-04-21 21:49:50 +00:00
|
|
|
if (num_cfs > 1) {
|
|
|
|
for (int cf = 0; cf < num_cfs; ++cf) {
|
|
|
|
cf_names_.emplace_back("cf" + std::to_string(cf));
|
|
|
|
}
|
|
|
|
CreateColumnFamilies(cf_names_, options);
|
|
|
|
cf_names_.emplace_back("default");
|
2019-07-01 03:52:34 +00:00
|
|
|
}
|
Merge adjacent file block reads in RocksDB MultiGet() and Add uncompressed block to cache (#6089)
Summary:
In the current MultiGet, if the KV-pairs do not belong to the data blocks in the block cache, multiple blocks are read from a SST. It will trigger one block read for each block request and read them in parallel. In some cases, if some data blocks are adjacent in the SST, the reads for these blocks can be combined to a single large read, which can reduce the system calls and reduce the read latency if possible.
Considering to fill the block cache, if multiple data blocks are in the same memory buffer, we need to copy them to the heap separately. Therefore, only in the case that 1) data block compression is enabled, and 2) compressed block cache is null, we can do combined read. Otherwise, extra memory copy is needed, which may cause extra overhead. In the current case, data blocks will be uncompressed to a new memory space.
Also, in the case that 1) data block compression is enabled, and 2) compressed block cache is null, it is possible the data block is actually not compressed. In the current logic, these data blocks will not be added to the uncompressed_cache. So if memory buffer is shared and the data block is not compressed, the data block are copied to the head and fill the cache.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6089
Test Plan: Added test case to ParallelIO.MultiGet. Pass make asan_check
Differential Revision: D18734668
Pulled By: zhichao-cao
fbshipit-source-id: 67c5615ed373e51e42635fd74b36f8f3a66d5da4
2019-12-16 23:55:33 +00:00
|
|
|
|
2020-04-21 21:49:50 +00:00
|
|
|
std::string zero_str(128, '\0');
|
|
|
|
for (int cf = 0; cf < num_cfs; ++cf) {
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
|
|
|
// Make the value compressible. A purely random string doesn't compress
|
|
|
|
// and the resultant data block will not be compressed
|
2020-07-09 21:33:42 +00:00
|
|
|
values_.emplace_back(rnd.RandomString(128) + zero_str);
|
2020-04-21 21:49:50 +00:00
|
|
|
assert(((num_cfs == 1) ? Put(Key(i), values_[i])
|
|
|
|
: Put(cf, Key(i), values_[i])) == Status::OK());
|
|
|
|
}
|
|
|
|
if (num_cfs == 1) {
|
2020-12-24 00:54:05 +00:00
|
|
|
EXPECT_OK(Flush());
|
2020-04-21 21:49:50 +00:00
|
|
|
} else {
|
2020-12-24 00:54:05 +00:00
|
|
|
EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
|
2020-04-21 21:49:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < 100; ++i) {
|
|
|
|
// block cannot gain space by compression
|
2020-07-09 21:33:42 +00:00
|
|
|
uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0');
|
2020-04-21 21:49:50 +00:00
|
|
|
std::string tmp_key = "a" + Key(i);
|
|
|
|
assert(((num_cfs == 1) ? Put(tmp_key, uncompressable_values_[i])
|
|
|
|
: Put(cf, tmp_key, uncompressable_values_[i])) ==
|
|
|
|
Status::OK());
|
|
|
|
}
|
|
|
|
if (num_cfs == 1) {
|
2020-12-24 00:54:05 +00:00
|
|
|
EXPECT_OK(Flush());
|
2020-04-21 21:49:50 +00:00
|
|
|
} else {
|
2020-12-24 00:54:05 +00:00
|
|
|
EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
|
2020-04-21 21:49:50 +00:00
|
|
|
}
|
Merge adjacent file block reads in RocksDB MultiGet() and Add uncompressed block to cache (#6089)
Summary:
In the current MultiGet, if the KV-pairs do not belong to the data blocks in the block cache, multiple blocks are read from a SST. It will trigger one block read for each block request and read them in parallel. In some cases, if some data blocks are adjacent in the SST, the reads for these blocks can be combined to a single large read, which can reduce the system calls and reduce the read latency if possible.
Considering to fill the block cache, if multiple data blocks are in the same memory buffer, we need to copy them to the heap separately. Therefore, only in the case that 1) data block compression is enabled, and 2) compressed block cache is null, we can do combined read. Otherwise, extra memory copy is needed, which may cause extra overhead. In the current case, data blocks will be uncompressed to a new memory space.
Also, in the case that 1) data block compression is enabled, and 2) compressed block cache is null, it is possible the data block is actually not compressed. In the current logic, these data blocks will not be added to the uncompressed_cache. So if memory buffer is shared and the data block is not compressed, the data block are copied to the head and fill the cache.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6089
Test Plan: Added test case to ParallelIO.MultiGet. Pass make asan_check
Differential Revision: D18734668
Pulled By: zhichao-cao
fbshipit-source-id: 67c5615ed373e51e42635fd74b36f8f3a66d5da4
2019-12-16 23:55:33 +00:00
|
|
|
}
|
2021-08-17 03:36:19 +00:00
|
|
|
// Clear compressed cache, which is always pre-populated
|
|
|
|
if (compressed_cache_) {
|
|
|
|
compressed_cache_->SetCapacity(0);
|
|
|
|
compressed_cache_->SetCapacity(1048576);
|
|
|
|
}
|
2019-07-01 03:52:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool CheckValue(int i, const std::string& value) {
|
|
|
|
if (values_[i].compare(value) == 0) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
Merge adjacent file block reads in RocksDB MultiGet() and Add uncompressed block to cache (#6089)
Summary:
In the current MultiGet, if the KV-pairs do not belong to the data blocks in the block cache, multiple blocks are read from a SST. It will trigger one block read for each block request and read them in parallel. In some cases, if some data blocks are adjacent in the SST, the reads for these blocks can be combined to a single large read, which can reduce the system calls and reduce the read latency if possible.
Considering to fill the block cache, if multiple data blocks are in the same memory buffer, we need to copy them to the heap separately. Therefore, only in the case that 1) data block compression is enabled, and 2) compressed block cache is null, we can do combined read. Otherwise, extra memory copy is needed, which may cause extra overhead. In the current case, data blocks will be uncompressed to a new memory space.
Also, in the case that 1) data block compression is enabled, and 2) compressed block cache is null, it is possible the data block is actually not compressed. In the current logic, these data blocks will not be added to the uncompressed_cache. So if memory buffer is shared and the data block is not compressed, the data block are copied to the head and fill the cache.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6089
Test Plan: Added test case to ParallelIO.MultiGet. Pass make asan_check
Differential Revision: D18734668
Pulled By: zhichao-cao
fbshipit-source-id: 67c5615ed373e51e42635fd74b36f8f3a66d5da4
2019-12-16 23:55:33 +00:00
|
|
|
bool CheckUncompressableValue(int i, const std::string& value) {
|
|
|
|
if (uncompressable_values_[i].compare(value) == 0) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-04-21 21:49:50 +00:00
|
|
|
const std::vector<std::string>& GetCFNames() const { return cf_names_; }
|
|
|
|
|
2019-07-01 03:52:34 +00:00
|
|
|
int num_lookups() { return uncompressed_cache_->num_lookups(); }
|
|
|
|
int num_found() { return uncompressed_cache_->num_found(); }
|
|
|
|
int num_inserts() { return uncompressed_cache_->num_inserts(); }
|
|
|
|
|
2019-09-20 19:00:55 +00:00
|
|
|
int num_lookups_compressed() { return compressed_cache_->num_lookups(); }
|
|
|
|
int num_found_compressed() { return compressed_cache_->num_found(); }
|
|
|
|
int num_inserts_compressed() { return compressed_cache_->num_inserts(); }
|
2019-07-01 03:52:34 +00:00
|
|
|
|
|
|
|
bool fill_cache() { return fill_cache_; }
|
2019-11-07 20:00:45 +00:00
|
|
|
bool compression_enabled() { return compression_enabled_; }
|
|
|
|
bool has_compressed_cache() { return compressed_cache_ != nullptr; }
|
|
|
|
bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; }
|
2020-12-10 00:57:54 +00:00
|
|
|
Options get_options() { return options_; }
|
2019-07-01 03:52:34 +00:00
|
|
|
|
|
|
|
static void SetUpTestCase() {}
|
|
|
|
static void TearDownTestCase() {}
|
|
|
|
|
2020-04-21 21:49:50 +00:00
|
|
|
protected:
|
2019-09-20 19:00:55 +00:00
|
|
|
class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
|
2019-07-01 03:52:34 +00:00
|
|
|
public:
|
|
|
|
MyFlushBlockPolicyFactory() {}
|
|
|
|
|
|
|
|
virtual const char* Name() const override {
|
|
|
|
return "MyFlushBlockPolicyFactory";
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual FlushBlockPolicy* NewFlushBlockPolicy(
|
|
|
|
const BlockBasedTableOptions& /*table_options*/,
|
|
|
|
const BlockBuilder& data_block_builder) const override {
|
|
|
|
return new MyFlushBlockPolicy(data_block_builder);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2019-09-20 19:00:55 +00:00
|
|
|
class MyFlushBlockPolicy : public FlushBlockPolicy {
|
2019-07-01 03:52:34 +00:00
|
|
|
public:
|
|
|
|
explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder)
|
2019-09-20 19:00:55 +00:00
|
|
|
: num_keys_(0), data_block_builder_(data_block_builder) {}
|
2019-07-01 03:52:34 +00:00
|
|
|
|
|
|
|
bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
|
|
|
|
if (data_block_builder_.empty()) {
|
|
|
|
// First key in this block
|
|
|
|
num_keys_ = 1;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
// Flush every 10 keys
|
|
|
|
if (num_keys_ == 10) {
|
|
|
|
num_keys_ = 1;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
num_keys_++;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
int num_keys_;
|
|
|
|
const BlockBuilder& data_block_builder_;
|
|
|
|
};
|
|
|
|
|
2020-04-27 20:18:18 +00:00
|
|
|
class MyBlockCache : public CacheWrapper {
|
2019-07-01 03:52:34 +00:00
|
|
|
public:
|
2020-04-27 20:18:18 +00:00
|
|
|
explicit MyBlockCache(std::shared_ptr<Cache> target)
|
|
|
|
: CacheWrapper(target),
|
|
|
|
num_lookups_(0),
|
|
|
|
num_found_(0),
|
|
|
|
num_inserts_(0) {}
|
|
|
|
|
|
|
|
const char* Name() const override { return "MyBlockCache"; }
|
|
|
|
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to Cache class, and it becomes more of low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
Status Insert(const Slice& key, Cache::ObjectPtr value,
|
|
|
|
const CacheItemHelper* helper, size_t charge,
|
2020-04-27 20:18:18 +00:00
|
|
|
Handle** handle = nullptr,
|
|
|
|
Priority priority = Priority::LOW) override {
|
2019-07-01 03:52:34 +00:00
|
|
|
num_inserts_++;
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to Cache class, and it becomes more of low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
return target_->Insert(key, value, helper, charge, handle, priority);
|
2019-07-01 03:52:34 +00:00
|
|
|
}
|
|
|
|
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to Cache class, and it becomes more of low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
|
|
|
|
CreateContext* create_context,
|
HyperClockCache support for SecondaryCache, with refactoring (#11301)
Summary:
Internally refactors SecondaryCache integration out of LRUCache specifically and into a wrapper/adapter class that works with various Cache implementations. Notably, this relies on separating the notion of async lookup handles from other cache handles, so that HyperClockCache doesn't have to deal with the problem of allocating handles from the hash table for lookups that might fail anyway, and might be on the same key without support for coalescing. (LRUCache's hash table can incorporate previously allocated handles thanks to its pointer indirection.) Specifically, I'm worried about the case in which hundreds of threads try to access the same block and probing in the hash table degrades to linear search on the pile of entries with the same key.
This change is a big step in the direction of supporting stacked SecondaryCaches, but there are obstacles to completing that. Especially, there is no SecondaryCache hook for evictions to pass from one to the next. It has been proposed that evictions be transmitted simply as the persisted data (as in SaveToCallback), but given the current structure provided by the CacheItemHelpers, that would require an extra copy of the block data, because there's intentionally no way to ask for a contiguous Slice of the data (to allow for flexibility in storage). `AsyncLookupHandle` and the re-worked `WaitAll()` should be essentially prepared for stacked SecondaryCaches, but several "TODO with stacked secondaries" issues remain in various places.
It could be argued that the stacking instead be done as a SecondaryCache adapter that wraps two (or more) SecondaryCaches, but at least with the current API that would require an extra heap allocation on SecondaryCache Lookup for a wrapper SecondaryCacheResultHandle that can transfer a Lookup between secondaries. We could also consider trying to unify the Cache and SecondaryCache APIs, though that might be difficult if `AsyncLookupHandle` is kept a fixed struct.
## cache.h (public API)
Moves `secondary_cache` option from LRUCacheOptions to ShardedCacheOptions so that it is applicable to HyperClockCache.
## advanced_cache.h (advanced public API)
* Add `Cache::CreateStandalone()` so that the SecondaryCache support wrapper can use it.
* Add `SetEvictionCallback()` / `eviction_callback_` so that the SecondaryCache support wrapper can use it. Only a single callback is supported for efficiency. If there is ever a need for more than one, hopefully that can be handled with a broadcast callback wrapper.
These are essentially the two "extra" pieces of `Cache` for pulling out specific SecondaryCache support from the `Cache` implementation. I think it's a good trade-off as these are reasonable, limited, and reusable "cut points" into the `Cache` implementations.
* Remove async capability from standard `Lookup()` (getting rid of awkward restrictions on pending Handles) and add `AsyncLookupHandle` and `StartAsyncLookup()`. As noted in the comments, the full struct of `AsyncLookupHandle` is exposed so that it can be stack allocated, for efficiency, though more data is being copied around than before, which could impact performance. (Lookup info -> AsyncLookupHandle -> Handle vs. Lookup info -> Handle)
I could foresee a future in which a Cache internally saves a pointer to the AsyncLookupHandle, which means it's dangerous to allow it to be copyable or even movable. It also means it's not compatible with std::vector (which I don't like requiring as an API parameter anyway), so `WaitAll()` expects any contiguous array of AsyncLookupHandles. I believe this is best for common case efficiency, while behaving well in other cases also. For example, `WaitAll()` has no effect on default-constructed AsyncLookupHandles, which look like a completed cache miss.
## cacheable_entry.h
A couple of functions are obsolete because Cache::Handle can no longer be pending.
## cache.cc
Provides default implementations for new or revamped Cache functions, especially appropriate for non-blocking caches.
## secondary_cache_adapter.{h,cc}
The full details of the Cache wrapper adding SecondaryCache support. Essentially replicates the SecondaryCache handling that was in LRUCache, but obviously refactored. There is a bit of logic duplication, where Lookup() is essentially a manually optimized version of StartAsyncLookup() and Wait(), but it's roughly a dozen lines of code.
## sharded_cache.h, typed_cache.h, charged_cache.{h,cc}, sim_cache.cc
Simply updated for Cache API changes.
## lru_cache.{h,cc}
Carefully remove SecondaryCache logic, implement `CreateStandalone` and eviction handler functionality.
## clock_cache.{h,cc}
Expose existing `CreateStandalone` functionality, add eviction handler functionality. Light refactoring.
## block_based_table_reader*
Mostly re-worked the only usage of async Lookup, which is in BlockBasedTable::MultiGet. Used arrays in place of autovector in some places for efficiency. Simplified some logic by not trying to process some cache results before they're all ready.
Created new function `BlockBasedTable::GetCachePriority()` to reduce some pre-existing code duplication (and avoid making it worse).
Fixed at least one small bug from the prior confusing mixture of async and sync Lookups. In MaybeReadBlockAndLoadToCache(), called by RetrieveBlock(), called by MultiGet() with wait=false, is_cache_hit for the block_cache_tracer entry would not be set to true if the handle was pending after Lookup and before Wait.
## Intended follow-up work
* Figure out if there are any missing stats or block_cache_tracer work in refactored BlockBasedTable::MultiGet
* Stacked secondary caches (see above discussion)
* See if we can make up for the small MultiGet performance regression.
* Study more performance with SecondaryCache
* Items evicted from over-full LRUCache in Release were not being demoted to SecondaryCache, and still aren't to minimize unit test churn. Ideally they would be demoted, but it's an exceptional case so not a big deal.
* Use CreateStandalone for cache reservations (save unnecessary hash table operations). Not a big deal, but worthy cleanup.
* Somehow I got the contract for SecondaryCache::Insert wrong in #10945. (Doesn't take ownership!) That API comment needs to be fixed, but didn't want to mingle that in here.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11301
Test Plan:
## Unit tests
Generally updated to include HCC in SecondaryCache tests, though HyperClockCache has some different, less strict behaviors that leads to some tests not really being set up to work with it. Some of the tests remain disabled with it, but I think we have good coverage without them.
## Crash/stress test
Updated to use the new combination.
## Performance
First, let's check for regression on caches without secondary cache configured. Adding support for the eviction callback is likely to have a tiny effect, but it shouldn't be worrisome. LRUCache could benefit slightly from less logic around SecondaryCache handling. We can test with cache_bench default settings, built with DEBUG_LEVEL=0 and PORTABLE=0.
```
(while :; do base/cache_bench --cache_type=hyper_clock_cache | grep Rough; done) | awk '{ sum += $9; count++; print $0; print "Average: " int(sum / count) }'
```
**Before** this and #11299 (which could also have a small effect), running for about an hour, before & after running concurrently for each cache type:
HyperClockCache: 3168662 (average parallel ops/sec)
LRUCache: 2940127
**After** this and #11299, running for about an hour:
HyperClockCache: 3164862 (average parallel ops/sec) (0.12% slower)
LRUCache: 2940928 (0.03% faster)
This is an acceptable difference IMHO.
Next, let's consider essentially the worst case of new CPU overhead affecting overall performance. MultiGet uses the async lookup interface regardless of whether SecondaryCache or folly are used. We can configure a benchmark where all block cache queries are for data blocks, and all are hits.
Create DB and test (before and after tests running simultaneously):
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=30000000 -disable_wal=1 -bloom_bits=16
TEST_TMPDIR=/dev/shm base/db_bench -benchmarks=multireadrandom[-X30] -readonly -multiread_batched -batch_size=32 -num=30000000 -bloom_bits=16 -cache_size=6789000000 -duration 20 -threads=16
```
**Before**:
multireadrandom [AVG 30 runs] : 3444202 (± 57049) ops/sec; 240.9 (± 4.0) MB/sec
multireadrandom [MEDIAN 30 runs] : 3514443 ops/sec; 245.8 MB/sec
**After**:
multireadrandom [AVG 30 runs] : 3291022 (± 58851) ops/sec; 230.2 (± 4.1) MB/sec
multireadrandom [MEDIAN 30 runs] : 3366179 ops/sec; 235.4 MB/sec
So that's roughly a 3% regression, on kind of a *worst case* test of MultiGet CPU. Similar story with HyperClockCache:
**Before**:
multireadrandom [AVG 30 runs] : 3933777 (± 41840) ops/sec; 275.1 (± 2.9) MB/sec
multireadrandom [MEDIAN 30 runs] : 3970667 ops/sec; 277.7 MB/sec
**After**:
multireadrandom [AVG 30 runs] : 3755338 (± 30391) ops/sec; 262.6 (± 2.1) MB/sec
multireadrandom [MEDIAN 30 runs] : 3785696 ops/sec; 264.8 MB/sec
Roughly a 4-5% regression. Not ideal, but not the whole story, fortunately.
Let's also look at Get() in db_bench:
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=readrandom[-X30] -readonly -num=30000000 -bloom_bits=16 -cache_size=6789000000 -duration 20 -threads=16
```
**Before**:
readrandom [AVG 30 runs] : 2198685 (± 13412) ops/sec; 153.8 (± 0.9) MB/sec
readrandom [MEDIAN 30 runs] : 2209498 ops/sec; 154.5 MB/sec
**After**:
readrandom [AVG 30 runs] : 2292814 (± 43508) ops/sec; 160.3 (± 3.0) MB/sec
readrandom [MEDIAN 30 runs] : 2365181 ops/sec; 165.4 MB/sec
That's showing roughly a 4% improvement, perhaps because of the secondary cache code that is no longer part of LRUCache. But weirdly, HyperClockCache is also showing 2-3% improvement:
**Before**:
readrandom [AVG 30 runs] : 2272333 (± 9992) ops/sec; 158.9 (± 0.7) MB/sec
readrandom [MEDIAN 30 runs] : 2273239 ops/sec; 159.0 MB/sec
**After**:
readrandom [AVG 30 runs] : 2332407 (± 11252) ops/sec; 163.1 (± 0.8) MB/sec
readrandom [MEDIAN 30 runs] : 2335329 ops/sec; 163.3 MB/sec
Reviewed By: ltamasi
Differential Revision: D44177044
Pulled By: pdillinger
fbshipit-source-id: e808e48ff3fe2f792a79841ba617be98e48689f5
2023-03-18 03:23:49 +00:00
|
|
|
Priority priority = Priority::LOW,
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to Cache class, and it becomes more of low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
Statistics* stats = nullptr) override {
|
2019-07-01 03:52:34 +00:00
|
|
|
num_lookups_++;
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to Cache class, and it becomes more of low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
Handle* handle =
|
HyperClockCache support for SecondaryCache, with refactoring (#11301)
Summary:
Internally refactors SecondaryCache integration out of LRUCache specifically and into a wrapper/adapter class that works with various Cache implementations. Notably, this relies on separating the notion of async lookup handles from other cache handles, so that HyperClockCache doesn't have to deal with the problem of allocating handles from the hash table for lookups that might fail anyway, and might be on the same key without support for coalescing. (LRUCache's hash table can incorporate previously allocated handles thanks to its pointer indirection.) Specifically, I'm worried about the case in which hundreds of threads try to access the same block and probing in the hash table degrades to linear search on the pile of entries with the same key.
This change is a big step in the direction of supporting stacked SecondaryCaches, but there are obstacles to completing that. Especially, there is no SecondaryCache hook for evictions to pass from one to the next. It has been proposed that evictions be transmitted simply as the persisted data (as in SaveToCallback), but given the current structure provided by the CacheItemHelpers, that would require an extra copy of the block data, because there's intentionally no way to ask for a contiguous Slice of the data (to allow for flexibility in storage). `AsyncLookupHandle` and the re-worked `WaitAll()` should be essentially prepared for stacked SecondaryCaches, but several "TODO with stacked secondaries" issues remain in various places.
It could be argued that the stacking instead be done as a SecondaryCache adapter that wraps two (or more) SecondaryCaches, but at least with the current API that would require an extra heap allocation on SecondaryCache Lookup for a wrapper SecondaryCacheResultHandle that can transfer a Lookup between secondaries. We could also consider trying to unify the Cache and SecondaryCache APIs, though that might be difficult if `AsyncLookupHandle` is kept a fixed struct.
## cache.h (public API)
Moves `secondary_cache` option from LRUCacheOptions to ShardedCacheOptions so that it is applicable to HyperClockCache.
## advanced_cache.h (advanced public API)
* Add `Cache::CreateStandalone()` so that the SecondaryCache support wrapper can use it.
* Add `SetEvictionCallback()` / `eviction_callback_` so that the SecondaryCache support wrapper can use it. Only a single callback is supported for efficiency. If there is ever a need for more than one, hopefully that can be handled with a broadcast callback wrapper.
These are essentially the two "extra" pieces of `Cache` for pulling out specific SecondaryCache support from the `Cache` implementation. I think it's a good trade-off as these are reasonable, limited, and reusable "cut points" into the `Cache` implementations.
* Remove async capability from standard `Lookup()` (getting rid of awkward restrictions on pending Handles) and add `AsyncLookupHandle` and `StartAsyncLookup()`. As noted in the comments, the full struct of `AsyncLookupHandle` is exposed so that it can be stack allocated, for efficiency, though more data is being copied around than before, which could impact performance. (Lookup info -> AsyncLookupHandle -> Handle vs. Lookup info -> Handle)
I could foresee a future in which a Cache internally saves a pointer to the AsyncLookupHandle, which means it's dangerous to allow it to be copyable or even movable. It also means it's not compatible with std::vector (which I don't like requiring as an API parameter anyway), so `WaitAll()` expects any contiguous array of AsyncLookupHandles. I believe this is best for common case efficiency, while behaving well in other cases also. For example, `WaitAll()` has no effect on default-constructed AsyncLookupHandles, which look like a completed cache miss.
## cacheable_entry.h
A couple of functions are obsolete because Cache::Handle can no longer be pending.
## cache.cc
Provides default implementations for new or revamped Cache functions, especially appropriate for non-blocking caches.
## secondary_cache_adapter.{h,cc}
The full details of the Cache wrapper adding SecondaryCache support. Essentially replicates the SecondaryCache handling that was in LRUCache, but obviously refactored. There is a bit of logic duplication, where Lookup() is essentially a manually optimized version of StartAsyncLookup() and Wait(), but it's roughly a dozen lines of code.
## sharded_cache.h, typed_cache.h, charged_cache.{h,cc}, sim_cache.cc
Simply updated for Cache API changes.
## lru_cache.{h,cc}
Carefully remove SecondaryCache logic, implement `CreateStandalone` and eviction handler functionality.
## clock_cache.{h,cc}
Expose existing `CreateStandalone` functionality, add eviction handler functionality. Light refactoring.
## block_based_table_reader*
Mostly re-worked the only usage of async Lookup, which is in BlockBasedTable::MultiGet. Used arrays in place of autovector in some places for efficiency. Simplified some logic by not trying to process some cache results before they're all ready.
Created new function `BlockBasedTable::GetCachePriority()` to reduce some pre-existing code duplication (and avoid making it worse).
Fixed at least one small bug from the prior confusing mixture of async and sync Lookups. In MaybeReadBlockAndLoadToCache(), called by RetrieveBlock(), called by MultiGet() with wait=false, is_cache_hit for the block_cache_tracer entry would not be set to true if the handle was pending after Lookup and before Wait.
## Intended follow-up work
* Figure out if there are any missing stats or block_cache_tracer work in refactored BlockBasedTable::MultiGet
* Stacked secondary caches (see above discussion)
* See if we can make up for the small MultiGet performance regression.
* Study more performance with SecondaryCache
* Items evicted from over-full LRUCache in Release were not being demoted to SecondaryCache, and still aren't to minimize unit test churn. Ideally they would be demoted, but it's an exceptional case so not a big deal.
* Use CreateStandalone for cache reservations (save unnecessary hash table operations). Not a big deal, but worthy cleanup.
* Somehow I got the contract for SecondaryCache::Insert wrong in #10945. (Doesn't take ownership!) That API comment needs to be fixed, but didn't want to mingle that in here.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11301
Test Plan:
## Unit tests
Generally updated to include HCC in SecondaryCache tests, though HyperClockCache has some different, less strict behaviors that leads to some tests not really being set up to work with it. Some of the tests remain disabled with it, but I think we have good coverage without them.
## Crash/stress test
Updated to use the new combination.
## Performance
First, let's check for regression on caches without secondary cache configured. Adding support for the eviction callback is likely to have a tiny effect, but it shouldn't be worrisome. LRUCache could benefit slightly from less logic around SecondaryCache handling. We can test with cache_bench default settings, built with DEBUG_LEVEL=0 and PORTABLE=0.
```
(while :; do base/cache_bench --cache_type=hyper_clock_cache | grep Rough; done) | awk '{ sum += $9; count++; print $0; print "Average: " int(sum / count) }'
```
**Before** this and #11299 (which could also have a small effect), running for about an hour, before & after running concurrently for each cache type:
HyperClockCache: 3168662 (average parallel ops/sec)
LRUCache: 2940127
**After** this and #11299, running for about an hour:
HyperClockCache: 3164862 (average parallel ops/sec) (0.12% slower)
LRUCache: 2940928 (0.03% faster)
This is an acceptable difference IMHO.
Next, let's consider essentially the worst case of new CPU overhead affecting overall performance. MultiGet uses the async lookup interface regardless of whether SecondaryCache or folly are used. We can configure a benchmark where all block cache queries are for data blocks, and all are hits.
Create DB and test (before and after tests running simultaneously):
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=fillrandom -num=30000000 -disable_wal=1 -bloom_bits=16
TEST_TMPDIR=/dev/shm base/db_bench -benchmarks=multireadrandom[-X30] -readonly -multiread_batched -batch_size=32 -num=30000000 -bloom_bits=16 -cache_size=6789000000 -duration 20 -threads=16
```
**Before**:
multireadrandom [AVG 30 runs] : 3444202 (± 57049) ops/sec; 240.9 (± 4.0) MB/sec
multireadrandom [MEDIAN 30 runs] : 3514443 ops/sec; 245.8 MB/sec
**After**:
multireadrandom [AVG 30 runs] : 3291022 (± 58851) ops/sec; 230.2 (± 4.1) MB/sec
multireadrandom [MEDIAN 30 runs] : 3366179 ops/sec; 235.4 MB/sec
So that's roughly a 3% regression, on kind of a *worst case* test of MultiGet CPU. Similar story with HyperClockCache:
**Before**:
multireadrandom [AVG 30 runs] : 3933777 (± 41840) ops/sec; 275.1 (± 2.9) MB/sec
multireadrandom [MEDIAN 30 runs] : 3970667 ops/sec; 277.7 MB/sec
**After**:
multireadrandom [AVG 30 runs] : 3755338 (± 30391) ops/sec; 262.6 (± 2.1) MB/sec
multireadrandom [MEDIAN 30 runs] : 3785696 ops/sec; 264.8 MB/sec
Roughly a 4-5% regression. Not ideal, but not the whole story, fortunately.
Let's also look at Get() in db_bench:
```
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=readrandom[-X30] -readonly -num=30000000 -bloom_bits=16 -cache_size=6789000000 -duration 20 -threads=16
```
**Before**:
readrandom [AVG 30 runs] : 2198685 (± 13412) ops/sec; 153.8 (± 0.9) MB/sec
readrandom [MEDIAN 30 runs] : 2209498 ops/sec; 154.5 MB/sec
**After**:
readrandom [AVG 30 runs] : 2292814 (± 43508) ops/sec; 160.3 (± 3.0) MB/sec
readrandom [MEDIAN 30 runs] : 2365181 ops/sec; 165.4 MB/sec
That's showing roughly a 4% improvement, perhaps because of the secondary cache code that is no longer part of LRUCache. But weirdly, HyperClockCache is also showing 2-3% improvement:
**Before**:
readrandom [AVG 30 runs] : 2272333 (± 9992) ops/sec; 158.9 (± 0.7) MB/sec
readrandom [MEDIAN 30 runs] : 2273239 ops/sec; 159.0 MB/sec
**After**:
readrandom [AVG 30 runs] : 2332407 (± 11252) ops/sec; 163.1 (± 0.8) MB/sec
readrandom [MEDIAN 30 runs] : 2335329 ops/sec; 163.3 MB/sec
Reviewed By: ltamasi
Differential Revision: D44177044
Pulled By: pdillinger
fbshipit-source-id: e808e48ff3fe2f792a79841ba617be98e48689f5
2023-03-18 03:23:49 +00:00
|
|
|
target_->Lookup(key, helper, create_context, priority, stats);
|
2019-07-01 03:52:34 +00:00
|
|
|
if (handle != nullptr) {
|
|
|
|
num_found_++;
|
|
|
|
}
|
|
|
|
return handle;
|
|
|
|
}
|
Major Cache refactoring, CPU efficiency improvement (#10975)
Summary:
This is several refactorings bundled into one to avoid having to incrementally re-modify uses of Cache several times. Overall, there are breaking changes to Cache class, and it becomes more of low-level interface for implementing caches, especially block cache. New internal APIs make using Cache cleaner than before, and more insulated from block cache evolution. Hopefully, this is the last really big block cache refactoring, because of rather effectively decoupling the implementations from the uses. This change also removes the EXPERIMENTAL designation on the SecondaryCache support in Cache. It seems reasonably mature at this point but still subject to change/evolution (as I warn in the API docs for Cache).
The high-level motivation for this refactoring is to minimize code duplication / compounding complexity in adding SecondaryCache support to HyperClockCache (in a later PR). Other benefits listed below.
* static_cast lines of code +29 -35 (net removed 6)
* reinterpret_cast lines of code +6 -32 (net removed 26)
## cache.h and secondary_cache.h
* Always use CacheItemHelper with entries instead of just a Deleter. There are several motivations / justifications:
* Simpler for implementations to deal with just one Insert and one Lookup.
* Simpler and more efficient implementation because we don't have to track which entries are using helpers and which are using deleters
* Gets rid of hack to classify cache entries by their deleter. Instead, the CacheItemHelper includes a CacheEntryRole. This simplifies a lot of code (cache_entry_roles.h almost eliminated). Fixes https://github.com/facebook/rocksdb/issues/9428.
* Makes it trivial to adjust SecondaryCache behavior based on kind of block (e.g. don't re-compress filter blocks).
* It is arguably less convenient for many direct users of Cache, but direct users of Cache are now rare with introduction of typed_cache.h (below).
* I considered and rejected an alternative approach in which we reduce customizability by assuming each secondary cache compatible value starts with a Slice referencing the uncompressed block contents (already true or mostly true), but we apparently intend to stack secondary caches. Saving an entry from a compressed secondary to a lower tier requires custom handling offered by SaveToCallback, etc.
* Make CreateCallback part of the helper and introduce CreateContext to work with it (alternative to https://github.com/facebook/rocksdb/issues/10562). This cleans up the interface while still allowing context to be provided for loading/parsing values into primary cache. This model works for async lookup in BlockBasedTable reader (reader owns a CreateContext) under the assumption that it always waits on secondary cache operations to finish. (Otherwise, the CreateContext could be destroyed while async operation depending on it continues.) This likely contributes most to the observed performance improvement because it saves an std::function backed by a heap allocation.
* Use char* for serialized data, e.g. in SaveToCallback, where void* was confusingly used. (We use `char*` for serialized byte data all over RocksDB, with many advantages over `void*`. `memcpy` etc. are legacy APIs that should not be mimicked.)
* Add a type alias Cache::ObjectPtr = void*, so that we can better indicate the intent of the void* when it is to be the object associated with a Cache entry. Related: started (but did not complete) a refactoring to move away from "value" of a cache entry toward "object" or "obj". (It is confusing to call Cache a key-value store (like DB) when it is really storing arbitrary in-memory objects, not byte strings.)
* Remove unnecessary key param from DeleterFn. This is good for efficiency in HyperClockCache, which does not directly store the cache key in memory. (Alternative to https://github.com/facebook/rocksdb/issues/10774)
* Add allocator to Cache DeleterFn. This is a kind of future-proofing change in case we get more serious about using the Cache allocator for memory tracked by the Cache. Right now, only the uncompressed block contents are allocated using the allocator, and a pointer to that allocator is saved as part of the cached object so that the deleter can use it. (See CacheAllocationPtr.) If in the future we are able to "flatten out" our Cache objects some more, it would be good not to have to track the allocator as part of each object.
* Removes legacy `ApplyToAllCacheEntries` and changes `ApplyToAllEntries` signature for Deleter->CacheItemHelper change.
## typed_cache.h
Adds various "typed" interfaces to the Cache as internal APIs, so that most uses of Cache can use simple type safe code without casting and without explicit deleters, etc. Almost all of the non-test, non-glue code uses of Cache have been migrated. (Follow-up work: CompressedSecondaryCache deserves deeper attention to migrate.) This change expands RocksDB's internal usage of metaprogramming and SFINAE (https://en.cppreference.com/w/cpp/language/sfinae).
The existing usages of Cache are divided up at a high level into these new interfaces. See updated existing uses of Cache for examples of how these are used.
* PlaceholderCacheInterface - Used for making cache reservations, with entries that have a charge but no value.
* BasicTypedCacheInterface<TValue> - Used for primary cache storage of objects of type TValue, which can be cleaned up with std::default_delete<TValue>. The role is provided by TValue::kCacheEntryRole or given in an optional template parameter.
* FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache compatible storage of objects of type TValue. In addition to BasicTypedCacheInterface constraints, we require TValue::ContentSlice() to return persistable data. This simplifies usage for the normal case of simple secondary cache compatibility (can give you a Slice to the data already in memory). In addition to TCreateContext performing the role of Cache::CreateContext, it is also expected to provide a factory function for creating TValue.
* For each of these, there's a "Shared" version (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache, rather than assuming external ownership by holding only a raw `Cache*`.
These interfaces introduce specific handle types for each interface instantiation, so that it's easy to see what kind of object is controlled by a handle. (Ultimately, this might not be worth the extra complexity, but it seems OK so far.)
Note: I attempted to make the cache 'charge' automatically inferred from the cache object type, such as by expecting an ApproximateMemoryUsage() function, but this is not so clean because there are cases where we need to compute the charge ahead of time and don't want to re-compute it.
## block_cache.h
This header is essentially the replacement for the old block_like_traits.h. It includes various things to support block cache access with typed_cache.h for block-based table.
## block_based_table_reader.cc
Before this change, accessing the block cache here was an awkward mix of static polymorphism (template TBlocklike) and switch-case on a dynamic BlockType value. This change mostly unifies on static polymorphism, relying on minor hacks in block_cache.h to distinguish variants of Block. We still check BlockType in some places (especially for stats, which could be improved in follow-up work) but at least the BlockType is a static constant from the template parameter. (No more awkward partial redundancy between static and dynamic info.) This likely contributes to the overall performance improvement, but hasn't been tested in isolation.
The other key source of simplification here is a more unified system of creating block cache objects: for directly populating from primary cache and for promotion from secondary cache. Both use BlockCreateContext, for context and for factory functions.
## block_based_table_builder.cc, cache_dump_load_impl.cc
Before this change, warming caches was super ugly code. Both of these source files had switch statements to basically transition from the dynamic BlockType world to the static TBlocklike world. None of that mess is needed anymore as there's a new, untyped WarmInCache function that handles all the details just as promotion from SecondaryCache would. (Fixes `TODO akanksha: Dedup below code` in block_based_table_builder.cc.)
## Everything else
Mostly just updating Cache users to use new typed APIs when reasonably possible, or changed Cache APIs when not.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10975
Test Plan:
tests updated
Performance test setup similar to https://github.com/facebook/rocksdb/issues/10626 (by cache size, LRUCache when not "hyper" for HyperClockCache):
34MB 1thread base.hyper -> kops/s: 0.745 io_bytes/op: 2.52504e+06 miss_ratio: 0.140906 max_rss_mb: 76.4844
34MB 1thread new.hyper -> kops/s: 0.751 io_bytes/op: 2.5123e+06 miss_ratio: 0.140161 max_rss_mb: 79.3594
34MB 1thread base -> kops/s: 0.254 io_bytes/op: 1.36073e+07 miss_ratio: 0.918818 max_rss_mb: 45.9297
34MB 1thread new -> kops/s: 0.252 io_bytes/op: 1.36157e+07 miss_ratio: 0.918999 max_rss_mb: 44.1523
34MB 32thread base.hyper -> kops/s: 7.272 io_bytes/op: 2.88323e+06 miss_ratio: 0.162532 max_rss_mb: 516.602
34MB 32thread new.hyper -> kops/s: 7.214 io_bytes/op: 2.99046e+06 miss_ratio: 0.168818 max_rss_mb: 518.293
34MB 32thread base -> kops/s: 3.528 io_bytes/op: 1.35722e+07 miss_ratio: 0.914691 max_rss_mb: 264.926
34MB 32thread new -> kops/s: 3.604 io_bytes/op: 1.35744e+07 miss_ratio: 0.915054 max_rss_mb: 264.488
233MB 1thread base.hyper -> kops/s: 53.909 io_bytes/op: 2552.35 miss_ratio: 0.0440566 max_rss_mb: 241.984
233MB 1thread new.hyper -> kops/s: 62.792 io_bytes/op: 2549.79 miss_ratio: 0.044043 max_rss_mb: 241.922
233MB 1thread base -> kops/s: 1.197 io_bytes/op: 2.75173e+06 miss_ratio: 0.103093 max_rss_mb: 241.559
233MB 1thread new -> kops/s: 1.199 io_bytes/op: 2.73723e+06 miss_ratio: 0.10305 max_rss_mb: 240.93
233MB 32thread base.hyper -> kops/s: 1298.69 io_bytes/op: 2539.12 miss_ratio: 0.0440307 max_rss_mb: 371.418
233MB 32thread new.hyper -> kops/s: 1421.35 io_bytes/op: 2538.75 miss_ratio: 0.0440307 max_rss_mb: 347.273
233MB 32thread base -> kops/s: 9.693 io_bytes/op: 2.77304e+06 miss_ratio: 0.103745 max_rss_mb: 569.691
233MB 32thread new -> kops/s: 9.75 io_bytes/op: 2.77559e+06 miss_ratio: 0.103798 max_rss_mb: 552.82
1597MB 1thread base.hyper -> kops/s: 58.607 io_bytes/op: 1449.14 miss_ratio: 0.0249324 max_rss_mb: 1583.55
1597MB 1thread new.hyper -> kops/s: 69.6 io_bytes/op: 1434.89 miss_ratio: 0.0247167 max_rss_mb: 1584.02
1597MB 1thread base -> kops/s: 60.478 io_bytes/op: 1421.28 miss_ratio: 0.024452 max_rss_mb: 1589.45
1597MB 1thread new -> kops/s: 63.973 io_bytes/op: 1416.07 miss_ratio: 0.0243766 max_rss_mb: 1589.24
1597MB 32thread base.hyper -> kops/s: 1436.2 io_bytes/op: 1357.93 miss_ratio: 0.0235353 max_rss_mb: 1692.92
1597MB 32thread new.hyper -> kops/s: 1605.03 io_bytes/op: 1358.04 miss_ratio: 0.023538 max_rss_mb: 1702.78
1597MB 32thread base -> kops/s: 280.059 io_bytes/op: 1350.34 miss_ratio: 0.023289 max_rss_mb: 1675.36
1597MB 32thread new -> kops/s: 283.125 io_bytes/op: 1351.05 miss_ratio: 0.0232797 max_rss_mb: 1703.83
Almost uniformly improving over base revision, especially for hot paths with HyperClockCache, up to 12% higher throughput seen (1597MB, 32thread, hyper). The improvement for that is likely coming from much simplified code for providing context for secondary cache promotion (CreateCallback/CreateContext), and possibly from less branching in block_based_table_reader. And likely a small improvement from not reconstituting key for DeleterFn.
Reviewed By: anand1976
Differential Revision: D42417818
Pulled By: pdillinger
fbshipit-source-id: f86bfdd584dce27c028b151ba56818ad14f7a432
2023-01-11 22:20:40 +00:00
|
|
|
|
2019-07-01 03:52:34 +00:00
|
|
|
int num_lookups() { return num_lookups_; }
|
|
|
|
|
|
|
|
int num_found() { return num_found_; }
|
|
|
|
|
|
|
|
int num_inserts() { return num_inserts_; }
|
2019-09-20 19:00:55 +00:00
|
|
|
|
2019-07-01 03:52:34 +00:00
|
|
|
private:
|
|
|
|
int num_lookups_;
|
|
|
|
int num_found_;
|
|
|
|
int num_inserts_;
|
|
|
|
};
|
|
|
|
|
|
|
|
std::shared_ptr<MyBlockCache> compressed_cache_;
|
|
|
|
std::shared_ptr<MyBlockCache> uncompressed_cache_;
|
2020-12-10 00:57:54 +00:00
|
|
|
Options options_;
|
2019-07-01 03:52:34 +00:00
|
|
|
bool compression_enabled_;
|
|
|
|
std::vector<std::string> values_;
|
Merge adjacent file block reads in RocksDB MultiGet() and Add uncompressed block to cache (#6089)
Summary:
In the current MultiGet, if the KV-pairs do not belong to the data blocks in the block cache, multiple blocks are read from a SST. It will trigger one block read for each block request and read them in parallel. In some cases, if some data blocks are adjacent in the SST, the reads for these blocks can be combined to a single large read, which can reduce the system calls and reduce the read latency if possible.
Considering to fill the block cache, if multiple data blocks are in the same memory buffer, we need to copy them to the heap separately. Therefore, only in the case that 1) data block compression is enabled, and 2) compressed block cache is null, we can do combined read. Otherwise, extra memory copy is needed, which may cause extra overhead. In the current case, data blocks will be uncompressed to a new memory space.
Also, in the case that 1) data block compression is enabled, and 2) compressed block cache is null, it is possible the data block is actually not compressed. In the current logic, these data blocks will not be added to the uncompressed_cache. So if memory buffer is shared and the data block is not compressed, the data block are copied to the head and fill the cache.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6089
Test Plan: Added test case to ParallelIO.MultiGet. Pass make asan_check
Differential Revision: D18734668
Pulled By: zhichao-cao
fbshipit-source-id: 67c5615ed373e51e42635fd74b36f8f3a66d5da4
2019-12-16 23:55:33 +00:00
|
|
|
std::vector<std::string> uncompressable_values_;
|
2019-07-01 03:52:34 +00:00
|
|
|
bool fill_cache_;
|
2020-04-21 21:49:50 +00:00
|
|
|
std::vector<std::string> cf_names_;
|
|
|
|
};
|
|
|
|
|
2023-01-25 01:09:19 +00:00
|
|
|
class DBBasicTestWithParallelIO : public DBBasicTestMultiGet,
|
|
|
|
public testing::WithParamInterface<
|
|
|
|
std::tuple<bool, bool, bool, uint32_t>> {
|
2020-04-21 21:49:50 +00:00
|
|
|
public:
|
|
|
|
DBBasicTestWithParallelIO()
|
|
|
|
: DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1,
|
|
|
|
std::get<0>(GetParam()), std::get<1>(GetParam()),
|
2023-01-25 01:09:19 +00:00
|
|
|
std::get<2>(GetParam()), std::get<3>(GetParam())) {}
|
2019-07-01 03:52:34 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestWithParallelIO, MultiGet) {
|
|
|
|
std::vector<std::string> key_data(10);
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
// We cannot resize a PinnableSlice vector, so just set initial size to
|
|
|
|
// largest we think we will need
|
|
|
|
std::vector<PinnableSlice> values(10);
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.fill_cache = fill_cache();
|
|
|
|
|
|
|
|
// Warm up the cache first
|
|
|
|
key_data.emplace_back(Key(0));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
key_data.emplace_back(Key(50));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
2019-09-20 19:00:55 +00:00
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
2019-07-01 03:52:34 +00:00
|
|
|
ASSERT_TRUE(CheckValue(0, values[0].ToString()));
|
|
|
|
ASSERT_TRUE(CheckValue(50, values[1].ToString()));
|
|
|
|
|
|
|
|
int random_reads = env_->random_read_counter_.Read();
|
|
|
|
key_data[0] = Key(1);
|
|
|
|
key_data[1] = Key(51);
|
|
|
|
keys[0] = Slice(key_data[0]);
|
|
|
|
keys[1] = Slice(key_data[1]);
|
|
|
|
values[0].Reset();
|
|
|
|
values[1].Reset();
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
2019-09-20 19:00:55 +00:00
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
2019-07-01 03:52:34 +00:00
|
|
|
ASSERT_TRUE(CheckValue(1, values[0].ToString()));
|
|
|
|
ASSERT_TRUE(CheckValue(51, values[1].ToString()));
|
|
|
|
|
2019-11-07 20:00:45 +00:00
|
|
|
bool read_from_cache = false;
|
|
|
|
if (fill_cache()) {
|
|
|
|
if (has_uncompressed_cache()) {
|
|
|
|
read_from_cache = true;
|
|
|
|
} else if (has_compressed_cache() && compression_enabled()) {
|
|
|
|
read_from_cache = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int expected_reads = random_reads + (read_from_cache ? 0 : 2);
|
2019-07-01 03:52:34 +00:00
|
|
|
ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
|
|
|
|
|
|
|
|
keys.resize(10);
|
|
|
|
statuses.resize(10);
|
2019-09-20 19:00:55 +00:00
|
|
|
std::vector<int> key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
|
2019-07-01 03:52:34 +00:00
|
|
|
for (size_t i = 0; i < key_ints.size(); ++i) {
|
|
|
|
key_data[i] = Key(key_ints[i]);
|
|
|
|
keys[i] = Slice(key_data[i]);
|
|
|
|
statuses[i] = Status::OK();
|
|
|
|
values[i].Reset();
|
|
|
|
}
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
2019-09-20 19:00:55 +00:00
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
2019-07-01 03:52:34 +00:00
|
|
|
for (size_t i = 0; i < key_ints.size(); ++i) {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString()));
|
|
|
|
}
|
Merge adjacent file block reads in RocksDB MultiGet() and Add uncompressed block to cache (#6089)
Summary:
In the current MultiGet, if the KV-pairs do not belong to the data blocks in the block cache, multiple blocks are read from a SST. It will trigger one block read for each block request and read them in parallel. In some cases, if some data blocks are adjacent in the SST, the reads for these blocks can be combined to a single large read, which can reduce the system calls and reduce the read latency if possible.
Considering to fill the block cache, if multiple data blocks are in the same memory buffer, we need to copy them to the heap separately. Therefore, only in the case that 1) data block compression is enabled, and 2) compressed block cache is null, we can do combined read. Otherwise, extra memory copy is needed, which may cause extra overhead. In the current case, data blocks will be uncompressed to a new memory space.
Also, in the case that 1) data block compression is enabled, and 2) compressed block cache is null, it is possible the data block is actually not compressed. In the current logic, these data blocks will not be added to the uncompressed_cache. So if memory buffer is shared and the data block is not compressed, the data block are copied to the head and fill the cache.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6089
Test Plan: Added test case to ParallelIO.MultiGet. Pass make asan_check
Differential Revision: D18734668
Pulled By: zhichao-cao
fbshipit-source-id: 67c5615ed373e51e42635fd74b36f8f3a66d5da4
2019-12-16 23:55:33 +00:00
|
|
|
if (compression_enabled() && !has_compressed_cache()) {
|
|
|
|
expected_reads += (read_from_cache ? 2 : 3);
|
|
|
|
} else {
|
|
|
|
expected_reads += (read_from_cache ? 2 : 4);
|
|
|
|
}
|
|
|
|
ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
|
|
|
|
|
|
|
|
keys.resize(10);
|
|
|
|
statuses.resize(10);
|
|
|
|
std::vector<int> key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
|
|
|
|
for (size_t i = 0; i < key_uncmp.size(); ++i) {
|
|
|
|
key_data[i] = "a" + Key(key_uncmp[i]);
|
|
|
|
keys[i] = Slice(key_data[i]);
|
|
|
|
statuses[i] = Status::OK();
|
|
|
|
values[i].Reset();
|
|
|
|
}
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
|
|
|
for (size_t i = 0; i < key_uncmp.size(); ++i) {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString()));
|
|
|
|
}
|
|
|
|
if (compression_enabled() && !has_compressed_cache()) {
|
|
|
|
expected_reads += (read_from_cache ? 3 : 3);
|
|
|
|
} else {
|
|
|
|
expected_reads += (read_from_cache ? 4 : 4);
|
|
|
|
}
|
2019-07-01 03:52:34 +00:00
|
|
|
ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
|
Merge adjacent file block reads in RocksDB MultiGet() and Add uncompressed block to cache (#6089)
Summary:
In the current MultiGet, if the KV-pairs do not belong to the data blocks in the block cache, multiple blocks are read from a SST. It will trigger one block read for each block request and read them in parallel. In some cases, if some data blocks are adjacent in the SST, the reads for these blocks can be combined to a single large read, which can reduce the system calls and reduce the read latency if possible.
Considering to fill the block cache, if multiple data blocks are in the same memory buffer, we need to copy them to the heap separately. Therefore, only in the case that 1) data block compression is enabled, and 2) compressed block cache is null, we can do combined read. Otherwise, extra memory copy is needed, which may cause extra overhead. In the current case, data blocks will be uncompressed to a new memory space.
Also, in the case that 1) data block compression is enabled, and 2) compressed block cache is null, it is possible the data block is actually not compressed. In the current logic, these data blocks will not be added to the uncompressed_cache. So if memory buffer is shared and the data block is not compressed, the data block are copied to the head and fill the cache.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6089
Test Plan: Added test case to ParallelIO.MultiGet. Pass make asan_check
Differential Revision: D18734668
Pulled By: zhichao-cao
fbshipit-source-id: 67c5615ed373e51e42635fd74b36f8f3a66d5da4
2019-12-16 23:55:33 +00:00
|
|
|
|
|
|
|
keys.resize(5);
|
|
|
|
statuses.resize(5);
|
|
|
|
std::vector<int> key_tr{1, 2, 15, 16, 55};
|
|
|
|
for (size_t i = 0; i < key_tr.size(); ++i) {
|
|
|
|
key_data[i] = "a" + Key(key_tr[i]);
|
|
|
|
keys[i] = Slice(key_data[i]);
|
|
|
|
statuses[i] = Status::OK();
|
|
|
|
values[i].Reset();
|
|
|
|
}
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
|
|
|
for (size_t i = 0; i < key_tr.size(); ++i) {
|
|
|
|
ASSERT_OK(statuses[i]);
|
|
|
|
ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString()));
|
|
|
|
}
|
|
|
|
if (compression_enabled() && !has_compressed_cache()) {
|
|
|
|
expected_reads += (read_from_cache ? 0 : 2);
|
|
|
|
ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
|
|
|
|
} else {
|
|
|
|
if (has_uncompressed_cache()) {
|
|
|
|
expected_reads += (read_from_cache ? 0 : 3);
|
|
|
|
ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
|
|
|
|
} else {
|
|
|
|
// A rare case, even we enable the block compression but some of data
|
|
|
|
// blocks are not compressed due to content. If user only enable the
|
|
|
|
// compressed cache, the uncompressed blocks will not tbe cached, and
|
|
|
|
// block reads will be triggered. The number of reads is related to
|
|
|
|
// the compression algorithm.
|
|
|
|
ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads);
|
|
|
|
}
|
|
|
|
}
|
2019-07-01 03:52:34 +00:00
|
|
|
}
|
|
|
|
|
2020-12-10 00:57:54 +00:00
|
|
|
TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) {
|
|
|
|
class FakeDirectIOEnv : public EnvWrapper {
|
|
|
|
class FakeDirectIOSequentialFile;
|
|
|
|
class FakeDirectIORandomAccessFile;
|
|
|
|
|
|
|
|
public:
|
|
|
|
FakeDirectIOEnv(Env* env) : EnvWrapper(env) {}
|
2022-01-05 00:44:54 +00:00
|
|
|
static const char* kClassName() { return "FakeDirectIOEnv"; }
|
|
|
|
const char* Name() const override { return kClassName(); }
|
2020-12-10 00:57:54 +00:00
|
|
|
|
|
|
|
Status NewRandomAccessFile(const std::string& fname,
|
|
|
|
std::unique_ptr<RandomAccessFile>* result,
|
|
|
|
const EnvOptions& options) override {
|
|
|
|
std::unique_ptr<RandomAccessFile> file;
|
|
|
|
assert(options.use_direct_reads);
|
|
|
|
EnvOptions opts = options;
|
|
|
|
opts.use_direct_reads = false;
|
|
|
|
Status s = target()->NewRandomAccessFile(fname, &file, opts);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
result->reset(new FakeDirectIORandomAccessFile(std::move(file)));
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
class FakeDirectIOSequentialFile : public SequentialFileWrapper {
|
|
|
|
public:
|
|
|
|
FakeDirectIOSequentialFile(std::unique_ptr<SequentialFile>&& file)
|
|
|
|
: SequentialFileWrapper(file.get()), file_(std::move(file)) {}
|
|
|
|
~FakeDirectIOSequentialFile() {}
|
|
|
|
|
|
|
|
bool use_direct_io() const override { return true; }
|
|
|
|
size_t GetRequiredBufferAlignment() const override { return 1; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::unique_ptr<SequentialFile> file_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper {
|
|
|
|
public:
|
|
|
|
FakeDirectIORandomAccessFile(std::unique_ptr<RandomAccessFile>&& file)
|
|
|
|
: RandomAccessFileWrapper(file.get()), file_(std::move(file)) {}
|
|
|
|
~FakeDirectIORandomAccessFile() {}
|
|
|
|
|
|
|
|
bool use_direct_io() const override { return true; }
|
|
|
|
size_t GetRequiredBufferAlignment() const override { return 1; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::unique_ptr<RandomAccessFile> file_;
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
std::unique_ptr<FakeDirectIOEnv> env(new FakeDirectIOEnv(env_));
|
|
|
|
Options opts = get_options();
|
|
|
|
opts.env = env.get();
|
|
|
|
opts.use_direct_reads = true;
|
|
|
|
Reopen(opts);
|
|
|
|
|
|
|
|
std::vector<std::string> key_data(10);
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
// We cannot resize a PinnableSlice vector, so just set initial size to
|
|
|
|
// largest we think we will need
|
|
|
|
std::vector<PinnableSlice> values(10);
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.fill_cache = fill_cache();
|
|
|
|
|
|
|
|
// Warm up the cache first
|
|
|
|
key_data.emplace_back(Key(0));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
key_data.emplace_back(Key(50));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
|
|
|
ASSERT_TRUE(CheckValue(0, values[0].ToString()));
|
|
|
|
ASSERT_TRUE(CheckValue(50, values[1].ToString()));
|
|
|
|
|
|
|
|
int random_reads = env_->random_read_counter_.Read();
|
|
|
|
key_data[0] = Key(1);
|
|
|
|
key_data[1] = Key(51);
|
|
|
|
keys[0] = Slice(key_data[0]);
|
|
|
|
keys[1] = Slice(key_data[1]);
|
|
|
|
values[0].Reset();
|
|
|
|
values[1].Reset();
|
|
|
|
if (uncompressed_cache_) {
|
|
|
|
uncompressed_cache_->SetCapacity(0);
|
|
|
|
uncompressed_cache_->SetCapacity(1048576);
|
|
|
|
}
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
|
|
|
ASSERT_TRUE(CheckValue(1, values[0].ToString()));
|
|
|
|
ASSERT_TRUE(CheckValue(51, values[1].ToString()));
|
|
|
|
|
|
|
|
bool read_from_cache = false;
|
|
|
|
if (fill_cache()) {
|
|
|
|
if (has_uncompressed_cache()) {
|
|
|
|
read_from_cache = true;
|
|
|
|
} else if (has_compressed_cache() && compression_enabled()) {
|
|
|
|
read_from_cache = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
int expected_reads = random_reads;
|
|
|
|
if (!compression_enabled() || !has_compressed_cache()) {
|
|
|
|
expected_reads += 2;
|
|
|
|
} else {
|
|
|
|
expected_reads += (read_from_cache ? 0 : 2);
|
|
|
|
}
|
|
|
|
if (env_->random_read_counter_.Read() != expected_reads) {
|
|
|
|
ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-02-12 01:25:10 +00:00
|
|
|
TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) {
|
|
|
|
std::vector<std::string> key_data(10);
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
// We cannot resize a PinnableSlice vector, so just set initial size to
|
|
|
|
// largest we think we will need
|
|
|
|
std::vector<PinnableSlice> values(10);
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
int read_count = 0;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.fill_cache = fill_cache();
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
2020-03-24 18:21:10 +00:00
|
|
|
"RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) {
|
|
|
|
Status* s = static_cast<Status*>(status);
|
|
|
|
read_count++;
|
|
|
|
if (read_count == 2) {
|
|
|
|
*s = Status::Corruption();
|
|
|
|
}
|
|
|
|
});
|
2020-02-12 01:25:10 +00:00
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
// Warm up the cache first
|
|
|
|
key_data.emplace_back(Key(0));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
key_data.emplace_back(Key(50));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
|
|
|
ASSERT_TRUE(CheckValue(0, values[0].ToString()));
|
2020-03-24 18:21:10 +00:00
|
|
|
// ASSERT_TRUE(CheckValue(50, values[1].ToString()));
|
2020-02-12 01:25:10 +00:00
|
|
|
ASSERT_EQ(statuses[0], Status::OK());
|
|
|
|
ASSERT_EQ(statuses[1], Status::Corruption());
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) {
|
|
|
|
std::vector<std::string> key_data(10);
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
// We cannot resize a PinnableSlice vector, so just set initial size to
|
|
|
|
// largest we think we will need
|
|
|
|
std::vector<PinnableSlice> values(10);
|
|
|
|
std::vector<Status> statuses;
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.fill_cache = fill_cache();
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
2020-03-24 18:21:10 +00:00
|
|
|
"TableCache::MultiGet:FindTable", [&](void* status) {
|
|
|
|
Status* s = static_cast<Status*>(status);
|
|
|
|
*s = Status::IOError();
|
|
|
|
});
|
2020-02-12 01:25:10 +00:00
|
|
|
// DB open will create table readers unless we reduce the table cache
|
|
|
|
// capacity.
|
|
|
|
// SanitizeOptions will set max_open_files to minimum of 20. Table cache
|
|
|
|
// is allocated with max_open_files - 10 as capacity. So override
|
|
|
|
// max_open_files to 11 so table cache capacity will become 1. This will
|
|
|
|
// prevent file open during DB open and force the file to be opened
|
|
|
|
// during MultiGet
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
2020-03-24 18:21:10 +00:00
|
|
|
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
|
|
|
|
int* max_open_files = (int*)arg;
|
|
|
|
*max_open_files = 11;
|
|
|
|
});
|
2020-02-12 01:25:10 +00:00
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Reopen(CurrentOptions());
|
|
|
|
|
|
|
|
// Warm up the cache first
|
|
|
|
key_data.emplace_back(Key(0));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
key_data.emplace_back(Key(50));
|
|
|
|
keys.emplace_back(Slice(key_data.back()));
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
|
|
|
|
dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
|
|
|
|
keys.data(), values.data(), statuses.data(), true);
|
|
|
|
ASSERT_EQ(statuses[0], Status::IOError());
|
|
|
|
ASSERT_EQ(statuses[1], Status::IOError());
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
}
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO,
|
|
|
|
// Params are as follows -
|
2023-01-25 01:09:19 +00:00
|
|
|
// Param 0 - Uncompressed cache enabled
|
|
|
|
// Param 1 - Data compression enabled
|
|
|
|
// Param 2 - ReadOptions::fill_cache
|
|
|
|
// Param 3 - CompressionOptions::parallel_threads
|
2020-06-03 22:53:09 +00:00
|
|
|
::testing::Combine(::testing::Bool(), ::testing::Bool(),
|
2023-01-25 01:09:19 +00:00
|
|
|
::testing::Bool(),
|
2020-06-03 22:53:09 +00:00
|
|
|
::testing::Values(1, 4)));
|
2019-06-06 06:07:28 +00:00
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
// Forward declaration
|
|
|
|
class DeadlineFS;
|
|
|
|
|
2021-09-13 15:45:13 +00:00
|
|
|
class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
|
2020-04-21 21:49:50 +00:00
|
|
|
public:
|
2020-08-07 18:59:19 +00:00
|
|
|
DeadlineRandomAccessFile(DeadlineFS& fs,
|
2020-06-29 21:51:57 +00:00
|
|
|
std::unique_ptr<FSRandomAccessFile>& file)
|
2021-09-13 15:45:13 +00:00
|
|
|
: FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
|
2020-04-21 21:49:50 +00:00
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts,
|
|
|
|
Slice* result, char* scratch,
|
|
|
|
IODebugContext* dbg) const override;
|
2020-04-21 21:49:50 +00:00
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
|
|
|
|
const IOOptions& options, IODebugContext* dbg) override;
|
2020-04-21 21:49:50 +00:00
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
|
|
|
|
std::function<void(const FSReadRequest&, void*)> cb,
|
|
|
|
void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
|
|
|
|
IODebugContext* dbg) override;
|
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
private:
|
|
|
|
DeadlineFS& fs_;
|
|
|
|
std::unique_ptr<FSRandomAccessFile> file_;
|
|
|
|
};
|
2020-04-21 21:49:50 +00:00
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
class DeadlineFS : public FileSystemWrapper {
|
|
|
|
public:
|
2020-08-07 18:59:19 +00:00
|
|
|
// The error_on_delay parameter specifies whether a IOStatus::TimedOut()
|
|
|
|
// status should be returned after delaying the IO to exceed the timeout,
|
|
|
|
// or to simply delay but return success anyway. The latter mimics the
|
|
|
|
// behavior of PosixFileSystem, which does not enforce any timeout
|
|
|
|
explicit DeadlineFS(SpecialEnv* env, bool error_on_delay)
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
: FileSystemWrapper(env->GetFileSystem()),
|
2020-06-29 21:51:57 +00:00
|
|
|
deadline_(std::chrono::microseconds::zero()),
|
2020-08-07 18:59:19 +00:00
|
|
|
io_timeout_(std::chrono::microseconds::zero()),
|
2020-06-29 21:51:57 +00:00
|
|
|
env_(env),
|
|
|
|
timedout_(false),
|
2020-08-07 18:59:19 +00:00
|
|
|
ignore_deadline_(false),
|
|
|
|
error_on_delay_(error_on_delay) {}
|
2020-06-29 21:51:57 +00:00
|
|
|
|
2021-11-02 16:06:02 +00:00
|
|
|
static const char* kClassName() { return "DeadlineFileSystem"; }
|
|
|
|
const char* Name() const override { return kClassName(); }
|
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
IOStatus NewRandomAccessFile(const std::string& fname,
|
|
|
|
const FileOptions& opts,
|
|
|
|
std::unique_ptr<FSRandomAccessFile>* result,
|
|
|
|
IODebugContext* dbg) override {
|
|
|
|
std::unique_ptr<FSRandomAccessFile> file;
|
2020-09-29 16:47:33 +00:00
|
|
|
IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
|
|
|
|
EXPECT_OK(s);
|
2020-08-07 18:59:19 +00:00
|
|
|
result->reset(new DeadlineRandomAccessFile(*this, file));
|
2020-06-29 21:51:57 +00:00
|
|
|
|
|
|
|
const std::chrono::microseconds deadline = GetDeadline();
|
2020-08-07 18:59:19 +00:00
|
|
|
const std::chrono::microseconds io_timeout = GetIOTimeout();
|
|
|
|
if (deadline.count() || io_timeout.count()) {
|
|
|
|
AssertDeadline(deadline, io_timeout, opts.io_options);
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
2020-08-07 18:59:19 +00:00
|
|
|
return ShouldDelay(opts.io_options);
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Set a vector of {IO counter, delay in microseconds, return status} tuples
|
|
|
|
// that control when to inject a delay and duration of the delay
|
2020-08-07 18:59:19 +00:00
|
|
|
void SetDelayTrigger(const std::chrono::microseconds deadline,
|
|
|
|
const std::chrono::microseconds io_timeout,
|
|
|
|
const int trigger) {
|
|
|
|
delay_trigger_ = trigger;
|
2020-06-29 21:51:57 +00:00
|
|
|
io_count_ = 0;
|
|
|
|
deadline_ = deadline;
|
2020-08-07 18:59:19 +00:00
|
|
|
io_timeout_ = io_timeout;
|
2020-06-29 21:51:57 +00:00
|
|
|
timedout_ = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Increment the IO counter and return a delay in microseconds
|
2020-08-07 18:59:19 +00:00
|
|
|
IOStatus ShouldDelay(const IOOptions& opts) {
|
2021-01-06 18:48:24 +00:00
|
|
|
if (timedout_) {
|
|
|
|
return IOStatus::TimedOut();
|
|
|
|
} else if (!deadline_.count() && !io_timeout_.count()) {
|
2020-08-07 18:59:19 +00:00
|
|
|
return IOStatus::OK();
|
|
|
|
}
|
|
|
|
if (!ignore_deadline_ && delay_trigger_ == io_count_++) {
|
|
|
|
env_->SleepForMicroseconds(static_cast<int>(opts.timeout.count() + 1));
|
2020-06-29 21:51:57 +00:00
|
|
|
timedout_ = true;
|
2020-08-07 18:59:19 +00:00
|
|
|
if (error_on_delay_) {
|
|
|
|
return IOStatus::TimedOut();
|
|
|
|
}
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
2020-08-07 18:59:19 +00:00
|
|
|
return IOStatus::OK();
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
2020-04-21 21:49:50 +00:00
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
const std::chrono::microseconds GetDeadline() {
|
|
|
|
return ignore_deadline_ ? std::chrono::microseconds::zero() : deadline_;
|
|
|
|
}
|
|
|
|
|
2020-08-07 18:59:19 +00:00
|
|
|
const std::chrono::microseconds GetIOTimeout() {
|
|
|
|
return ignore_deadline_ ? std::chrono::microseconds::zero() : io_timeout_;
|
|
|
|
}
|
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
bool TimedOut() { return timedout_; }
|
|
|
|
|
|
|
|
void IgnoreDeadline(bool ignore) { ignore_deadline_ = ignore; }
|
|
|
|
|
|
|
|
void AssertDeadline(const std::chrono::microseconds deadline,
|
2020-08-07 18:59:19 +00:00
|
|
|
const std::chrono::microseconds io_timeout,
|
2020-06-29 21:51:57 +00:00
|
|
|
const IOOptions& opts) const {
|
|
|
|
// Give a leeway of +- 10us as it can take some time for the Get/
|
|
|
|
// MultiGet call to reach here, in order to avoid false alarms
|
|
|
|
std::chrono::microseconds now =
|
|
|
|
std::chrono::microseconds(env_->NowMicros());
|
2020-08-07 18:59:19 +00:00
|
|
|
std::chrono::microseconds timeout;
|
|
|
|
if (deadline.count()) {
|
|
|
|
timeout = deadline - now;
|
|
|
|
if (io_timeout.count()) {
|
|
|
|
timeout = std::min(timeout, io_timeout);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
timeout = io_timeout;
|
|
|
|
}
|
|
|
|
if (opts.timeout != timeout) {
|
|
|
|
ASSERT_EQ(timeout, opts.timeout);
|
2020-04-21 21:49:50 +00:00
|
|
|
}
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2020-08-07 18:59:19 +00:00
|
|
|
// The number of IOs to trigger the delay after
|
|
|
|
int delay_trigger_;
|
|
|
|
// Current IO count
|
2020-06-29 21:51:57 +00:00
|
|
|
int io_count_;
|
2020-08-07 18:59:19 +00:00
|
|
|
// ReadOptions deadline for the Get/MultiGet/Iterator
|
2020-06-29 21:51:57 +00:00
|
|
|
std::chrono::microseconds deadline_;
|
2020-08-07 18:59:19 +00:00
|
|
|
// ReadOptions io_timeout for the Get/MultiGet/Iterator
|
|
|
|
std::chrono::microseconds io_timeout_;
|
2020-06-29 21:51:57 +00:00
|
|
|
SpecialEnv* env_;
|
2020-08-07 18:59:19 +00:00
|
|
|
// Flag to indicate whether we injected a delay
|
2020-06-29 21:51:57 +00:00
|
|
|
bool timedout_;
|
2020-08-07 18:59:19 +00:00
|
|
|
// Temporarily ignore deadlines/timeouts
|
2020-06-29 21:51:57 +00:00
|
|
|
bool ignore_deadline_;
|
2020-08-07 18:59:19 +00:00
|
|
|
// Return IOStatus::TimedOut() or IOStatus::OK()
|
|
|
|
bool error_on_delay_;
|
2020-06-29 21:51:57 +00:00
|
|
|
};
|
2020-04-21 21:49:50 +00:00
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len,
|
|
|
|
const IOOptions& opts, Slice* result,
|
|
|
|
char* scratch,
|
|
|
|
IODebugContext* dbg) const {
|
|
|
|
const std::chrono::microseconds deadline = fs_.GetDeadline();
|
2020-08-07 18:59:19 +00:00
|
|
|
const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
|
2020-06-29 21:51:57 +00:00
|
|
|
IOStatus s;
|
2020-08-07 18:59:19 +00:00
|
|
|
if (deadline.count() || io_timeout.count()) {
|
|
|
|
fs_.AssertDeadline(deadline, io_timeout, opts);
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch,
|
|
|
|
dbg);
|
|
|
|
}
|
2020-08-07 18:59:19 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
s = fs_.ShouldDelay(opts);
|
|
|
|
}
|
2020-06-29 21:51:57 +00:00
|
|
|
return s;
|
|
|
|
}
|
2020-04-30 21:48:51 +00:00
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
IOStatus DeadlineRandomAccessFile::ReadAsync(
|
|
|
|
FSReadRequest& req, const IOOptions& opts,
|
|
|
|
std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
|
|
|
|
void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
|
|
|
|
const std::chrono::microseconds deadline = fs_.GetDeadline();
|
|
|
|
const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
|
|
|
|
IOStatus s;
|
|
|
|
if (deadline.count() || io_timeout.count()) {
|
|
|
|
fs_.AssertDeadline(deadline, io_timeout, opts);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = FSRandomAccessFileWrapper::ReadAsync(req, opts, cb, cb_arg, io_handle,
|
|
|
|
del_fn, dbg);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = fs_.ShouldDelay(opts);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2020-06-29 21:51:57 +00:00
|
|
|
IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs,
|
|
|
|
size_t num_reqs,
|
|
|
|
const IOOptions& options,
|
|
|
|
IODebugContext* dbg) {
|
|
|
|
const std::chrono::microseconds deadline = fs_.GetDeadline();
|
2020-08-07 18:59:19 +00:00
|
|
|
const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
|
2020-06-29 21:51:57 +00:00
|
|
|
IOStatus s;
|
2020-08-07 18:59:19 +00:00
|
|
|
if (deadline.count() || io_timeout.count()) {
|
|
|
|
fs_.AssertDeadline(deadline, io_timeout, options);
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg);
|
|
|
|
}
|
2020-08-07 18:59:19 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
s = fs_.ShouldDelay(options);
|
|
|
|
}
|
2020-06-29 21:51:57 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// A test class for intercepting random reads and injecting artificial
|
2020-08-07 18:59:19 +00:00
|
|
|
// delays. Used for testing the MultiGet deadline feature
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet,
|
|
|
|
public testing::WithParamInterface<bool> {
|
2020-06-29 21:51:57 +00:00
|
|
|
public:
|
|
|
|
DBBasicTestMultiGetDeadline()
|
|
|
|
: DBBasicTestMultiGet(
|
|
|
|
"db_basic_test_multiget_deadline" /*Test dir*/,
|
2023-01-25 01:09:19 +00:00
|
|
|
10 /*# of column families*/, true /*uncompressed cache enabled*/,
|
|
|
|
true /*compression enabled*/, true /*ReadOptions.fill_cache*/,
|
2020-06-29 21:51:57 +00:00
|
|
|
1 /*# of parallel compression threads*/) {}
|
2020-04-21 21:49:50 +00:00
|
|
|
|
|
|
|
inline void CheckStatus(std::vector<Status>& statuses, size_t num_ok) {
|
|
|
|
for (size_t i = 0; i < statuses.size(); ++i) {
|
|
|
|
if (i < num_ok) {
|
|
|
|
EXPECT_OK(statuses[i]);
|
|
|
|
} else {
|
2020-08-07 18:59:19 +00:00
|
|
|
if (statuses[i] != Status::TimedOut()) {
|
|
|
|
EXPECT_EQ(statuses[i], Status::TimedOut());
|
|
|
|
}
|
2020-04-21 21:49:50 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
TEST_P(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
|
|
|
|
#ifndef USE_COROUTINES
|
|
|
|
if (GetParam()) {
|
|
|
|
ROCKSDB_GTEST_SKIP("This test requires coroutine support");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#endif // USE_COROUTINES
|
2020-08-07 18:59:19 +00:00
|
|
|
std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, false);
|
2020-04-30 21:48:51 +00:00
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
2020-04-21 21:49:50 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
|
|
|
|
std::shared_ptr<Cache> cache = NewLRUCache(1048576);
|
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
table_options.block_cache = cache;
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
2020-04-21 21:49:50 +00:00
|
|
|
options.env = env.get();
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
2020-08-11 19:39:49 +00:00
|
|
|
SetTimeElapseOnlySleepOnReopen(&options);
|
2020-04-21 21:49:50 +00:00
|
|
|
ReopenWithColumnFamilies(GetCFNames(), options);
|
|
|
|
|
|
|
|
// Test the non-batched version of MultiGet with multiple column
|
|
|
|
// families
|
|
|
|
std::vector<std::string> key_str;
|
|
|
|
size_t i;
|
|
|
|
for (i = 0; i < 5; ++i) {
|
|
|
|
key_str.emplace_back(Key(static_cast<int>(i)));
|
|
|
|
}
|
|
|
|
std::vector<ColumnFamilyHandle*> cfs(key_str.size());
|
|
|
|
;
|
|
|
|
std::vector<Slice> keys(key_str.size());
|
|
|
|
std::vector<std::string> values(key_str.size());
|
|
|
|
for (i = 0; i < key_str.size(); ++i) {
|
|
|
|
cfs[i] = handles_[i];
|
|
|
|
keys[i] = Slice(key_str[i].data(), key_str[i].size());
|
|
|
|
}
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
ro.async_io = GetParam();
|
2020-08-07 18:59:19 +00:00
|
|
|
// Delay the first IO
|
|
|
|
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
|
2020-04-30 21:48:51 +00:00
|
|
|
|
2020-04-21 21:49:50 +00:00
|
|
|
std::vector<Status> statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
|
|
|
|
// The first key is successful because we check after the lookup, but
|
|
|
|
// subsequent keys fail due to deadline exceeded
|
|
|
|
CheckStatus(statuses, 1);
|
|
|
|
|
|
|
|
// Clear the cache
|
|
|
|
cache->SetCapacity(0);
|
|
|
|
cache->SetCapacity(1048576);
|
|
|
|
// Test non-batched Multiget with multiple column families and
|
|
|
|
// introducing an IO delay in one of the middle CFs
|
|
|
|
key_str.clear();
|
|
|
|
for (i = 0; i < 10; ++i) {
|
|
|
|
key_str.emplace_back(Key(static_cast<int>(i)));
|
|
|
|
}
|
|
|
|
cfs.resize(key_str.size());
|
|
|
|
keys.resize(key_str.size());
|
|
|
|
values.resize(key_str.size());
|
|
|
|
for (i = 0; i < key_str.size(); ++i) {
|
|
|
|
// 2 keys per CF
|
|
|
|
cfs[i] = handles_[i / 2];
|
|
|
|
keys[i] = Slice(key_str[i].data(), key_str[i].size());
|
|
|
|
}
|
|
|
|
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
|
2020-08-07 18:59:19 +00:00
|
|
|
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
|
2020-04-21 21:49:50 +00:00
|
|
|
statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
|
|
|
|
CheckStatus(statuses, 3);
|
|
|
|
|
|
|
|
// Test batched MultiGet with an IO delay in the first data block read.
|
|
|
|
// Both keys in the first CF should succeed as they're in the same data
|
|
|
|
// block and would form one batch, and we check for deadline between
|
|
|
|
// batches.
|
|
|
|
std::vector<PinnableSlice> pin_values(keys.size());
|
|
|
|
cache->SetCapacity(0);
|
|
|
|
cache->SetCapacity(1048576);
|
|
|
|
statuses.clear();
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
|
2020-08-07 18:59:19 +00:00
|
|
|
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
|
2020-04-21 21:49:50 +00:00
|
|
|
dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
|
|
|
|
pin_values.data(), statuses.data());
|
|
|
|
CheckStatus(statuses, 2);
|
|
|
|
|
|
|
|
// Similar to the previous one, but an IO delay in the third CF data block
|
|
|
|
// read
|
|
|
|
for (PinnableSlice& value : pin_values) {
|
|
|
|
value.Reset();
|
|
|
|
}
|
|
|
|
cache->SetCapacity(0);
|
|
|
|
cache->SetCapacity(1048576);
|
|
|
|
statuses.clear();
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
|
2020-08-07 18:59:19 +00:00
|
|
|
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 2);
|
2020-04-21 21:49:50 +00:00
|
|
|
dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
|
|
|
|
pin_values.data(), statuses.data());
|
|
|
|
CheckStatus(statuses, 6);
|
|
|
|
|
|
|
|
// Similar to the previous one, but an IO delay in the last but one CF
|
|
|
|
for (PinnableSlice& value : pin_values) {
|
|
|
|
value.Reset();
|
|
|
|
}
|
|
|
|
cache->SetCapacity(0);
|
|
|
|
cache->SetCapacity(1048576);
|
|
|
|
statuses.clear();
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
|
2020-08-07 18:59:19 +00:00
|
|
|
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 3);
|
2020-04-21 21:49:50 +00:00
|
|
|
dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
|
|
|
|
pin_values.data(), statuses.data());
|
|
|
|
CheckStatus(statuses, 8);
|
|
|
|
|
|
|
|
// Test batched MultiGet with single CF and lots of keys. Inject delay
|
|
|
|
// into the second batch of keys. As each batch is 32, the first 64 keys,
|
|
|
|
// i.e first two batches, should succeed and the rest should time out
|
|
|
|
for (PinnableSlice& value : pin_values) {
|
|
|
|
value.Reset();
|
|
|
|
}
|
|
|
|
cache->SetCapacity(0);
|
|
|
|
cache->SetCapacity(1048576);
|
|
|
|
key_str.clear();
|
|
|
|
for (i = 0; i < 100; ++i) {
|
|
|
|
key_str.emplace_back(Key(static_cast<int>(i)));
|
|
|
|
}
|
|
|
|
keys.resize(key_str.size());
|
|
|
|
pin_values.clear();
|
|
|
|
pin_values.resize(key_str.size());
|
|
|
|
for (i = 0; i < key_str.size(); ++i) {
|
|
|
|
keys[i] = Slice(key_str[i].data(), key_str[i].size());
|
|
|
|
}
|
|
|
|
statuses.clear();
|
|
|
|
statuses.resize(keys.size());
|
|
|
|
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
|
2020-08-07 18:59:19 +00:00
|
|
|
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
|
2020-04-21 21:49:50 +00:00
|
|
|
dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(),
|
|
|
|
pin_values.data(), statuses.data());
|
|
|
|
CheckStatus(statuses, 64);
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
Multi file concurrency in MultiGet using coroutines and async IO (#9968)
Summary:
This PR implements a coroutine version of batched MultiGet in order to concurrently read from multiple SST files in a level using async IO, thus reducing the latency of the MultiGet. The API from the user perspective is still synchronous and single threaded, with the RocksDB part of the processing happening in the context of the caller's thread. In Version::MultiGet, the decision is made whether to call synchronous or coroutine code.
A good way to review this PR is to review the first 4 commits in order - de773b3, 70c2f70, 10b50e1, and 377a597 - before reviewing the rest.
TODO:
1. Figure out how to build it in CircleCI (requires some dependencies to be installed)
2. Do some stress testing with coroutines enabled
No regression in synchronous MultiGet between this branch and main -
```
./db_bench -use_existing_db=true --db=/data/mysql/rocksdb/prefix_scan -benchmarks="readseq,multireadrandom" -key_size=32 -value_size=512 -num=5000000 -batch_size=64 -multiread_batched=true -use_direct_reads=false -duration=60 -ops_between_duration_checks=1 -readonly=true -adaptive_readahead=true -threads=16 -cache_size=10485760000 -async_io=false -multiread_stride=40000 -statistics
```
Branch - ```multireadrandom : 4.025 micros/op 3975111 ops/sec 60.001 seconds 238509056 operations; 2062.3 MB/s (14767808 of 14767808 found)```
Main - ```multireadrandom : 3.987 micros/op 4013216 ops/sec 60.001 seconds 240795392 operations; 2082.1 MB/s (15231040 of 15231040 found)```
More benchmarks in various scenarios are given below. The measurements were taken with ```async_io=false``` (no coroutines) and ```async_io=true``` (use coroutines). For an IO bound workload (with every key requiring an IO), the coroutines version shows a clear benefit, being ~2.6X faster. For CPU bound workloads, the coroutines version has ~6-15% higher CPU utilization, depending on how many keys overlap an SST file.
1. Single thread IO bound workload on remote storage with sparse MultiGet batch keys (~1 key overlap/file) -
No coroutines - ```multireadrandom : 831.774 micros/op 1202 ops/sec 60.001 seconds 72136 operations; 0.6 MB/s (72136 of 72136 found)```
Using coroutines - ```multireadrandom : 318.742 micros/op 3137 ops/sec 60.003 seconds 188248 operations; 1.6 MB/s (188248 of 188248 found)```
2. Single thread CPU bound workload (all data cached) with ~1 key overlap/file -
No coroutines - ```multireadrandom : 4.127 micros/op 242322 ops/sec 60.000 seconds 14539384 operations; 125.7 MB/s (14539384 of 14539384 found)```
Using coroutines - ```multireadrandom : 4.741 micros/op 210935 ops/sec 60.000 seconds 12656176 operations; 109.4 MB/s (12656176 of 12656176 found)```
3. Single thread CPU bound workload with ~2 key overlap/file -
No coroutines - ```multireadrandom : 3.717 micros/op 269000 ops/sec 60.000 seconds 16140024 operations; 139.6 MB/s (16140024 of 16140024 found)```
Using coroutines - ```multireadrandom : 4.146 micros/op 241204 ops/sec 60.000 seconds 14472296 operations; 125.1 MB/s (14472296 of 14472296 found)```
4. CPU bound multi-threaded (16 threads) with ~4 key overlap/file -
No coroutines - ```multireadrandom : 4.534 micros/op 3528792 ops/sec 60.000 seconds 211728728 operations; 1830.7 MB/s (12737024 of 12737024 found) ```
Using coroutines - ```multireadrandom : 4.872 micros/op 3283812 ops/sec 60.000 seconds 197030096 operations; 1703.6 MB/s (12548032 of 12548032 found) ```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9968
Reviewed By: akankshamahajan15
Differential Revision: D36348563
Pulled By: anand1976
fbshipit-source-id: c0ce85a505fd26ebfbb09786cbd7f25202038696
2022-05-19 22:36:27 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(DeadlineIO, DBBasicTestMultiGetDeadline,
|
|
|
|
::testing::Bool());
|
|
|
|
|
First step towards handling MANIFEST write error (#6949)
Summary:
This PR provides preliminary support for handling IO error during MANIFEST write.
File write/sync is not guaranteed to be atomic. If we encounter an IOError while writing/syncing to the MANIFEST file, we cannot be sure about the state of the MANIFEST file. The version edits may or may not have reached the file. During cleanup, if we delete the newly-generated SST files referenced by the pending version edit(s), but the version edit(s) actually are persistent in the MANIFEST, then next recovery attempt will process the version edits(s) and then fail since the SST files have already been deleted.
One approach is to truncate the MANIFEST after write/sync error, so that it is safe to delete the SST files. However, file truncation may not be supported on certain file systems. Therefore, we take the following approach.
If an IOError is detected during MANIFEST write/sync, we disable file deletions for the faulty database. Depending on whether the IOError is retryable (set by underlying file system), either RocksDB or application can call `DB::Resume()`, or simply shutdown and restart. During `Resume()`, RocksDB will try to switch to a new MANIFEST and write all existing in-memory version storage in the new file. If this succeeds, then RocksDB may proceed. If all recovery is completed, then file deletions will be re-enabled.
Note that multiple threads can call `LogAndApply()` at the same time, though only one of them will be going through the process MANIFEST write, possibly batching the version edits of other threads. When the leading MANIFEST writer finishes, all of the MANIFEST writing threads in this batch will have the same IOError. They will all call `ErrorHandler::SetBGError()` in which file deletion will be disabled.
Possible future directions:
- Add an `ErrorContext` structure so that it is easier to pass more info to `ErrorHandler`. Currently, as in this example, a new `BackgroundErrorReason` has to be added.
Test plan (dev server):
make check
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6949
Reviewed By: anand1976
Differential Revision: D22026020
Pulled By: riversand963
fbshipit-source-id: f3c68a2ef45d9b505d0d625c7c5e0c88495b91c8
2020-06-25 02:05:47 +00:00
|
|
|
TEST_F(DBBasicTest, ManifestWriteFailure) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
options.env = env_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("foo", "bar"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
|
|
|
|
ASSERT_NE(nullptr, arg);
|
|
|
|
auto* s = reinterpret_cast<Status*>(arg);
|
|
|
|
ASSERT_OK(*s);
|
|
|
|
// Manually overwrite return status
|
|
|
|
*s = Status::IOError();
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
ASSERT_OK(Put("key", "value"));
|
|
|
|
ASSERT_NOK(Flush());
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
Reopen(options);
|
|
|
|
}
|
|
|
|
|
2022-01-06 04:25:57 +00:00
|
|
|
TEST_F(DBBasicTest, DestroyDefaultCfHandle) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
CreateAndReopenWithCF({"pikachu"}, options);
|
|
|
|
for (const auto* h : handles_) {
|
|
|
|
ASSERT_NE(db_->DefaultColumnFamily(), h);
|
|
|
|
}
|
|
|
|
|
|
|
|
// We have two handles to the default column family. The two handles point to
|
|
|
|
// different ColumnFamilyHandle objects.
|
|
|
|
assert(db_->DefaultColumnFamily());
|
|
|
|
ASSERT_EQ(0U, db_->DefaultColumnFamily()->GetID());
|
|
|
|
assert(handles_[0]);
|
|
|
|
ASSERT_EQ(0U, handles_[0]->GetID());
|
|
|
|
|
|
|
|
// You can destroy handles_[...].
|
|
|
|
for (auto* h : handles_) {
|
|
|
|
ASSERT_OK(db_->DestroyColumnFamilyHandle(h));
|
|
|
|
}
|
|
|
|
handles_.clear();
|
|
|
|
|
|
|
|
// But you should not destroy db_->DefaultColumnFamily(), since it's going to
|
|
|
|
// be deleted in `DBImpl::CloseHelper()`. Before that, it may be used
|
|
|
|
// elsewhere internally too.
|
|
|
|
ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
|
|
|
|
ASSERT_TRUE(db_->DestroyColumnFamilyHandle(default_cf).IsInvalidArgument());
|
|
|
|
}
|
|
|
|
|
2022-05-27 14:23:31 +00:00
|
|
|
TEST_F(DBBasicTest, FailOpenIfLoggerCreationFail) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) {
|
|
|
|
auto* s = reinterpret_cast<Status*>(arg);
|
|
|
|
assert(s);
|
|
|
|
*s = Status::IOError("Injected");
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
Status s = TryReopen(options);
|
|
|
|
ASSERT_EQ(nullptr, options.info_log);
|
2022-06-22 15:26:38 +00:00
|
|
|
ASSERT_TRUE(s.IsIOError());
|
2022-05-27 14:23:31 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
2020-11-04 04:33:45 +00:00
|
|
|
TEST_F(DBBasicTest, VerifyFileChecksums) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.env = env_;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
ASSERT_OK(Put("a", "value"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
|
|
|
|
|
|
|
|
options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
|
|
|
|
|
|
|
|
// Write an L0 with checksum computed.
|
|
|
|
ASSERT_OK(Put("b", "value"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
|
2021-01-04 19:50:29 +00:00
|
|
|
|
|
|
|
// Does the right thing but with the wrong name -- using it should lead to an
|
|
|
|
// error.
|
|
|
|
class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c {
|
|
|
|
public:
|
|
|
|
MisnamedFileChecksumGenerator(const FileChecksumGenContext& context)
|
|
|
|
: FileChecksumGenCrc32c(context) {}
|
|
|
|
|
|
|
|
const char* Name() const override { return "sha1"; }
|
|
|
|
};
|
|
|
|
|
|
|
|
class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory {
|
|
|
|
public:
|
|
|
|
std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
|
|
|
|
const FileChecksumGenContext& context) override {
|
|
|
|
return std::unique_ptr<FileChecksumGenerator>(
|
|
|
|
new MisnamedFileChecksumGenerator(context));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory());
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
|
2020-11-04 04:33:45 +00:00
|
|
|
}
|
2022-06-03 23:33:00 +00:00
|
|
|
|
2023-04-05 23:22:08 +00:00
|
|
|
TEST_F(DBBasicTest, VerifyFileChecksumsReadahead) {
|
|
|
|
Options options = GetDefaultOptions();
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.env = env_;
|
|
|
|
options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
int alignment = 256 * 1024;
|
|
|
|
for (int i = 0; i < 16; ++i) {
|
|
|
|
ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(alignment)));
|
|
|
|
}
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
|
|
|
|
std::vector<std::string> filenames;
|
|
|
|
int sst_cnt = 0;
|
|
|
|
std::string sst_name;
|
|
|
|
uint64_t sst_size;
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
|
|
|
|
for (auto name : filenames) {
|
|
|
|
if (ParseFileName(name, &number, &type)) {
|
|
|
|
if (type == kTableFile) {
|
|
|
|
sst_cnt++;
|
|
|
|
sst_name = name;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_EQ(sst_cnt, 1);
|
|
|
|
ASSERT_OK(env_->GetFileSize(dbname_ + '/' + sst_name, &sst_size));
|
|
|
|
|
|
|
|
bool last_read = false;
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"GenerateOneFileChecksum::Chunk:0", [&](void* /*arg*/) {
|
|
|
|
if (env_->random_read_bytes_counter_.load() == sst_size) {
|
|
|
|
EXPECT_FALSE(last_read);
|
|
|
|
last_read = true;
|
|
|
|
} else {
|
|
|
|
ASSERT_EQ(env_->random_read_bytes_counter_.load() & (alignment - 1),
|
|
|
|
0);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
env_->count_random_reads_ = true;
|
|
|
|
env_->random_read_bytes_counter_ = 0;
|
|
|
|
env_->random_read_counter_.Reset();
|
|
|
|
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.readahead_size = alignment;
|
|
|
|
ASSERT_OK(db_->VerifyFileChecksums(ro));
|
|
|
|
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
ASSERT_TRUE(last_read);
|
|
|
|
ASSERT_EQ(env_->random_read_counter_.Read(),
|
|
|
|
(sst_size + alignment - 1) / (alignment));
|
|
|
|
}
|
|
|
|
|
2022-07-13 00:16:57 +00:00
|
|
|
// TODO: re-enable after we provide finer-grained control for WAL tracking to
|
|
|
|
// meet the needs of different use cases, durability levels and recovery modes.
|
|
|
|
TEST_F(DBBasicTest, DISABLED_ManualWalSync) {
|
2022-06-03 23:33:00 +00:00
|
|
|
Options options = CurrentOptions();
|
|
|
|
options.track_and_verify_wals_in_manifest = true;
|
|
|
|
options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
ASSERT_OK(Put("x", "y"));
|
|
|
|
// This does not create a new WAL.
|
|
|
|
ASSERT_OK(db_->SyncWAL());
|
|
|
|
EXPECT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
|
|
|
|
|
|
|
|
std::unique_ptr<LogFile> wal;
|
|
|
|
Status s = db_->GetCurrentWalFile(&wal);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
Close();
|
|
|
|
|
|
|
|
EXPECT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
|
|
|
|
|
|
|
|
ASSERT_TRUE(TryReopen(options).IsCorruption());
|
|
|
|
}
|
2020-11-04 04:33:45 +00:00
|
|
|
|
2020-08-07 18:59:19 +00:00
|
|
|
// A test class for intercepting random reads and injecting artificial
|
|
|
|
// delays. Used for testing the deadline/timeout feature
|
|
|
|
class DBBasicTestDeadline
|
|
|
|
: public DBBasicTest,
|
|
|
|
public testing::WithParamInterface<std::tuple<bool, bool>> {};
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestDeadline, PointLookupDeadline) {
|
|
|
|
std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
|
2020-06-29 21:51:57 +00:00
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
2020-08-07 18:59:19 +00:00
|
|
|
bool set_deadline = std::get<0>(GetParam());
|
|
|
|
bool set_timeout = std::get<1>(GetParam());
|
2020-06-29 21:51:57 +00:00
|
|
|
|
|
|
|
for (int option_config = kDefault; option_config < kEnd; ++option_config) {
|
|
|
|
if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
option_config_ = option_config;
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
if (options.use_direct_reads) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
options.env = env.get();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
Cache* block_cache = nullptr;
|
|
|
|
// Fileter block reads currently don't cause the request to get
|
|
|
|
// aborted on a read timeout, so its possible those block reads
|
|
|
|
// may get issued even if the deadline is past
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlockBasedTable::Get:BeforeFilterMatch",
|
|
|
|
[&](void* /*arg*/) { fs->IgnoreDeadline(true); });
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlockBasedTable::Get:AfterFilterMatch",
|
|
|
|
[&](void* /*arg*/) { fs->IgnoreDeadline(false); });
|
|
|
|
// DB open will create table readers unless we reduce the table cache
|
|
|
|
// capacity.
|
|
|
|
// SanitizeOptions will set max_open_files to minimum of 20. Table cache
|
|
|
|
// is allocated with max_open_files - 10 as capacity. So override
|
|
|
|
// max_open_files to 11 so table cache capacity will become 1. This will
|
|
|
|
// prevent file open during DB open and force the file to be opened
|
|
|
|
// during MultiGet
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
|
|
|
|
int* max_open_files = (int*)arg;
|
|
|
|
*max_open_files = 11;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
2020-08-11 19:39:49 +00:00
|
|
|
SetTimeElapseOnlySleepOnReopen(&options);
|
2020-06-29 21:51:57 +00:00
|
|
|
Reopen(options);
|
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
if (options.table_factory) {
|
|
|
|
block_cache = options.table_factory->GetOptions<Cache>(
|
|
|
|
TableFactory::kBlockCacheOpts());
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < 400; ++i) {
|
2022-05-06 20:03:58 +00:00
|
|
|
std::string key = "k" + std::to_string(i);
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(key, rnd.RandomString(100)));
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
2020-12-24 00:54:05 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-06-29 21:51:57 +00:00
|
|
|
|
|
|
|
bool timedout = true;
|
|
|
|
// A timeout will be forced when the IO counter reaches this value
|
|
|
|
int io_deadline_trigger = 0;
|
|
|
|
// Keep incrementing io_deadline_trigger and call Get() until there is an
|
|
|
|
// iteration that doesn't cause a timeout. This ensures that we cover
|
|
|
|
// all file reads in the point lookup path that can potentially timeout
|
|
|
|
// and cause the Get() to fail.
|
|
|
|
while (timedout) {
|
|
|
|
ReadOptions ro;
|
2020-08-07 18:59:19 +00:00
|
|
|
if (set_deadline) {
|
|
|
|
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
|
|
|
|
}
|
|
|
|
if (set_timeout) {
|
|
|
|
ro.io_timeout = std::chrono::microseconds{5000};
|
|
|
|
}
|
|
|
|
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
|
2020-06-29 21:51:57 +00:00
|
|
|
|
|
|
|
block_cache->SetCapacity(0);
|
|
|
|
block_cache->SetCapacity(1048576);
|
|
|
|
|
|
|
|
std::string value;
|
|
|
|
Status s = dbfull()->Get(ro, "k50", &value);
|
|
|
|
if (fs->TimedOut()) {
|
|
|
|
ASSERT_EQ(s, Status::TimedOut());
|
|
|
|
} else {
|
|
|
|
timedout = false;
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
io_deadline_trigger++;
|
|
|
|
}
|
|
|
|
// Reset the delay sequence in order to avoid false alarms during Reopen
|
2020-08-07 18:59:19 +00:00
|
|
|
fs->SetDelayTrigger(std::chrono::microseconds::zero(),
|
|
|
|
std::chrono::microseconds::zero(), 0);
|
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(DBBasicTestDeadline, IteratorDeadline) {
|
|
|
|
std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
|
|
|
|
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
|
|
|
|
bool set_deadline = std::get<0>(GetParam());
|
|
|
|
bool set_timeout = std::get<1>(GetParam());
|
|
|
|
|
|
|
|
for (int option_config = kDefault; option_config < kEnd; ++option_config) {
|
|
|
|
if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
Options options = CurrentOptions();
|
|
|
|
if (options.use_direct_reads) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
options.env = env.get();
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
Cache* block_cache = nullptr;
|
|
|
|
// DB open will create table readers unless we reduce the table cache
|
|
|
|
// capacity.
|
|
|
|
// SanitizeOptions will set max_open_files to minimum of 20. Table cache
|
|
|
|
// is allocated with max_open_files - 10 as capacity. So override
|
|
|
|
// max_open_files to 11 so table cache capacity will become 1. This will
|
|
|
|
// prevent file open during DB open and force the file to be opened
|
|
|
|
// during MultiGet
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
|
|
|
|
int* max_open_files = (int*)arg;
|
|
|
|
*max_open_files = 11;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
Fix+clean up handling of mock sleeps (#7101)
Summary:
We have a number of tests hanging on MacOS and windows due to
mishandling of code for mock sleeps. In addition, the code was in
terrible shape because the same variable (addon_time_) would sometimes
refer to microseconds and sometimes to seconds. One test even assumed it
was nanoseconds but was written to pass anyway.
This has been cleaned up so that DB tests generally use a SpecialEnv
function to mock sleep, for either some number of microseconds or seconds
depending on the function called. But to call one of these, the test must first
call SetMockSleep (precondition enforced with assertion), which also turns
sleeps in RocksDB into mock sleeps. To also removes accounting for actual
clock time, call SetTimeElapseOnlySleepOnReopen, which implies
SetMockSleep (on DB re-open). This latter setting only works by applying
on DB re-open, otherwise havoc can ensue if Env goes back in time with
DB open.
More specifics:
Removed some unused test classes, and updated comments on the general
problem.
Fixed DBSSTTest.GetTotalSstFilesSize using a sync point callback instead
of mock time. For this we have the only modification to production code,
inserting a sync point callback in flush_job.cc, which is not a change to
production behavior.
Removed unnecessary resetting of mock times to 0 in many tests. RocksDB
deals in relative time. Any behaviors relying on absolute date/time are likely
a bug. (The above test DBSSTTest.GetTotalSstFilesSize was the only one
clearly injecting a specific absolute time for actual testing convenience.) Just
in case I misunderstood some test, I put this note in each replacement:
// NOTE: Presumed unnecessary and removed: resetting mock time in env
Strengthened some tests like MergeTestTime, MergeCompactionTimeTest, and
FilterCompactionTimeTest in db_test.cc
stats_history_test and blob_db_test are each their own beast, rather deeply
dependent on MockTimeEnv. Each gets its own variant of a work-around for
TimedWait in a mock time environment. (Reduces redundancy and
inconsistency in stats_history_test.)
Intended follow-up:
Remove TimedWait from the public API of InstrumentedCondVar, and only
make that accessible through Env by passing in an InstrumentedCondVar and
a deadline. Then the Env implementations mocking time can fix this problem
without using sync points. (Test infrastructure using sync points interferes
with individual tests' control over sync points.)
With that change, we can simplify/consolidate the scattered work-arounds.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7101
Test Plan: make check on Linux and MacOS
Reviewed By: zhichao-cao
Differential Revision: D23032815
Pulled By: pdillinger
fbshipit-source-id: 7f33967ada8b83011fb54e8279365c008bd6610b
2020-08-11 19:39:49 +00:00
|
|
|
SetTimeElapseOnlySleepOnReopen(&options);
|
2020-08-07 18:59:19 +00:00
|
|
|
Reopen(options);
|
|
|
|
|
2020-09-14 23:59:00 +00:00
|
|
|
if (options.table_factory) {
|
|
|
|
block_cache = options.table_factory->GetOptions<Cache>(
|
|
|
|
TableFactory::kBlockCacheOpts());
|
2020-08-07 18:59:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Random rnd(301);
|
|
|
|
for (int i = 0; i < 400; ++i) {
|
2022-05-06 20:03:58 +00:00
|
|
|
std::string key = "k" + std::to_string(i);
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Put(key, rnd.RandomString(100)));
|
2020-08-07 18:59:19 +00:00
|
|
|
}
|
2020-09-29 16:47:33 +00:00
|
|
|
ASSERT_OK(Flush());
|
2020-08-07 18:59:19 +00:00
|
|
|
|
|
|
|
bool timedout = true;
|
|
|
|
// A timeout will be forced when the IO counter reaches this value
|
|
|
|
int io_deadline_trigger = 0;
|
|
|
|
// Keep incrementing io_deadline_trigger and call Get() until there is an
|
|
|
|
// iteration that doesn't cause a timeout. This ensures that we cover
|
|
|
|
// all file reads in the point lookup path that can potentially timeout
|
|
|
|
while (timedout) {
|
|
|
|
ReadOptions ro;
|
|
|
|
if (set_deadline) {
|
|
|
|
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
|
|
|
|
}
|
|
|
|
if (set_timeout) {
|
|
|
|
ro.io_timeout = std::chrono::microseconds{5000};
|
|
|
|
}
|
|
|
|
fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
|
|
|
|
|
|
|
|
block_cache->SetCapacity(0);
|
|
|
|
block_cache->SetCapacity(1048576);
|
|
|
|
|
|
|
|
Iterator* iter = dbfull()->NewIterator(ro);
|
|
|
|
int count = 0;
|
|
|
|
iter->Seek("k50");
|
|
|
|
while (iter->Valid() && count++ < 100) {
|
|
|
|
iter->Next();
|
|
|
|
}
|
|
|
|
if (fs->TimedOut()) {
|
|
|
|
ASSERT_FALSE(iter->Valid());
|
|
|
|
ASSERT_EQ(iter->status(), Status::TimedOut());
|
|
|
|
} else {
|
|
|
|
timedout = false;
|
|
|
|
ASSERT_OK(iter->status());
|
|
|
|
}
|
|
|
|
delete iter;
|
|
|
|
io_deadline_trigger++;
|
|
|
|
}
|
|
|
|
// Reset the delay sequence in order to avoid false alarms during Reopen
|
|
|
|
fs->SetDelayTrigger(std::chrono::microseconds::zero(),
|
|
|
|
std::chrono::microseconds::zero(), 0);
|
2020-06-29 21:51:57 +00:00
|
|
|
}
|
|
|
|
Close();
|
|
|
|
}
|
|
|
|
|
2020-08-07 18:59:19 +00:00
|
|
|
// Param 0: If true, set read_options.deadline
|
|
|
|
// Param 1: If true, set read_options.io_timeout
|
|
|
|
INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline,
|
|
|
|
::testing::Values(std::make_tuple(true, false),
|
|
|
|
std::make_tuple(false, true),
|
|
|
|
std::make_tuple(true, true)));
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2017-02-27 20:19:23 +00:00
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2020-02-20 20:07:53 +00:00
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
2017-02-27 20:19:23 +00:00
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
2019-08-09 22:08:36 +00:00
|
|
|
RegisterCustomObjects(argc, argv);
|
2017-02-27 20:19:23 +00:00
|
|
|
return RUN_ALL_TESTS();
|
|
|
|
}
|