2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2011-03-18 22:37:00 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
2019-05-30 18:21:38 +00:00
|
|
|
#include "test_util/testutil.h"
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-04-25 06:58:13 +00:00
|
|
|
#include <fcntl.h>
|
2020-06-05 18:06:26 +00:00
|
|
|
#include <sys/stat.h>
|
2020-07-09 21:33:42 +00:00
|
|
|
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
#include <array>
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
#include <cctype>
|
2019-07-09 21:48:07 +00:00
|
|
|
#include <fstream>
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
#include <sstream>
|
|
|
|
|
2017-03-13 18:44:50 +00:00
|
|
|
#include "db/memtable_list.h"
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
#include "env/composite_env_wrapper.h"
|
2019-09-16 17:31:27 +00:00
|
|
|
#include "file/random_access_file_reader.h"
|
|
|
|
#include "file/sequence_file_reader.h"
|
|
|
|
#include "file/writable_file_writer.h"
|
2014-08-27 17:39:31 +00:00
|
|
|
#include "port/port.h"
|
2020-09-14 23:59:00 +00:00
|
|
|
#include "rocksdb/convenience.h"
|
2021-01-26 06:07:26 +00:00
|
|
|
#include "rocksdb/system_clock.h"
|
2021-08-19 17:09:30 +00:00
|
|
|
#include "rocksdb/utilities/object_registry.h"
|
2021-09-21 15:53:03 +00:00
|
|
|
#include "test_util/mock_time_env.h"
|
2020-04-25 06:58:13 +00:00
|
|
|
#include "test_util/sync_point.h"
|
2020-07-09 21:33:42 +00:00
|
|
|
#include "util/random.h"
|
2021-11-08 19:04:01 +00:00
|
|
|
|
|
|
|
#ifndef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS
|
|
|
|
void RegisterCustomObjects(int /*argc*/, char** /*argv*/) {}
|
|
|
|
#endif
|
2011-03-18 22:37:00 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-03-18 22:37:00 +00:00
|
|
|
namespace test {
|
|
|
|
|
2018-06-05 02:59:44 +00:00
|
|
|
const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version;
|
2021-12-10 16:12:09 +00:00
|
|
|
const std::set<uint32_t> kFooterFormatVersionsToTest{
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
// Non-legacy, before big footer changes
|
2021-12-10 16:12:09 +00:00
|
|
|
5U,
|
format_version=6 and context-aware block checksums (#9058)
Summary:
## Context checksum
All RocksDB checksums currently use 32 bits of checking
power, which should be 1 in 4 billion false negative (FN) probability (failing to
detect corruption). This is true for random corruptions, and in some cases
small corruptions are guaranteed to be detected. But some possible
corruptions, such as in storage metadata rather than storage payload data,
would have a much higher FN rate. For example:
* Data larger than one SST block is replaced by data from elsewhere in
the same or another SST file. Especially with block_align=true, the
probability of exact block size match is probably around 1 in 100, making
the FN probability around that same. Without `block_align=true` the
probability of same block start location is probably around 1 in 10,000,
for FN probability around 1 in a million.
To solve this problem in new format_version=6, we add "context awareness"
to block checksum checks. The stored and expected checksum value is
modified based on the block's position in the file and which file it is in. The
modifications are cleverly chosen so that, for example
* blocks within about 4GB of each other are guaranteed to use different context
* blocks that are offset by exactly some multiple of 4GiB are guaranteed to use
different context
* files generated by the same process are guaranteed to use different context
for the same offsets, until wrap-around after 2^32 - 1 files
Thus, with format_version=6, if a valid SST block and checksum is misplaced,
its checksum FN probability should be essentially ideal, 1 in 4B.
## Footer checksum
This change also adds checksum protection to the SST footer (with
format_version=6), for the first time without relying on whole file checksum.
To prevent a corruption of the format_version in the footer (e.g. 6 -> 5) to
defeat the footer checksum, we change much of the footer data format
including an "extended magic number" in format_version 6 that would be
interpreted as empty index and metaindex block handles in older footer
versions. We also change the encoding of handles to free up space for
other new data in footer.
## More detail: making space in footer
In order to keep footer the same size in format_version=6 (avoid change to IO
patterns), we have to free up some space for new data. We do this two ways:
* Metaindex block handle is encoded down to 4 bytes (from 10) by assuming
it immediately precedes the footer, and by assuming it is < 4GB.
* Index block handle is moved into metaindex. (I don't know why it was
in footer to begin with.)
## Performance
In case of small performance penalty, I've made a "pay as you go" optimization
to compensate: replace `MutableCFOptions` in BlockBasedTableBuilder::Rep
with the only field used in that structure after construction: `prefix_extractor`.
This makes the PR an overall performance improvement (results below).
Nevertheless I'm seeing essentially no difference going from fv=5 to fv=6,
even including that improvement for both. That's based on extreme case table
write performance testing, many files with many blocks. This is relatively
checksum intensive (small blocks) and salt generation intensive (small files).
```
(for I in `seq 1 100`; do TEST_TMPDIR=/dev/shm/dbbench2 ./db_bench -benchmarks=fillseq -memtablerep=vector -disable_wal=1 -allow_concurrent_memtable_write=false -num=3000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -write_buffer_size=100000 -compression_type=none -block_size=1000; done) 2>&1 | grep micros/op | tee out
awk '{ tot += $5; n += 1; } END { print int(1.0 * tot / n) }' < out
```
Each value below is ops/s averaged over 100 runs, run simultaneously with competing
configuration for load fairness
Before -> after (both fv=5): 483530 -> 483673 (negligible)
Re-run 1: 480733 -> 485427 (1.0% faster)
Re-run 2: 483821 -> 484541 (0.1% faster)
Before (fv=5) -> after (fv=6): 482006 -> 485100 (0.6% faster)
Re-run 1: 482212 -> 485075 (0.6% faster)
Re-run 2: 483590 -> 484073 (0.1% faster)
After fv=5 -> after fv=6: 483878 -> 485542 (0.3% faster)
Re-run 1: 485331 -> 483385 (0.4% slower)
Re-run 2: 485283 -> 483435 (0.4% slower)
Re-run 3: 483647 -> 486109 (0.5% faster)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9058
Test Plan:
unit tests included (table_test, db_properties_test, salt in env_test). General DB tests
and crash test updated to test new format_version.
Also temporarily updated the default format version to 6 and saw some test failures. Almost all
were due to an inadvertent additional read in VerifyChecksum to verify the index block checksum,
though it's arguably a bug that VerifyChecksum does not appear to (re-)verify the index block
checksum, just assuming it was verified in opening the index reader (probably *usually* true but
probably not always true). Some other concerns about VerifyChecksum are left in FIXME
comments. The only remaining test failure on change of default (in block_fetcher_test) now
has a comment about how to upgrade the test.
The format compatibility test does not need updating because we have not updated the default
format_version.
Reviewed By: ajkr, mrambacher
Differential Revision: D33100915
Pulled By: pdillinger
fbshipit-source-id: 8679e3e572fa580181a737fd6d113ed53c5422ee
2023-07-30 23:40:01 +00:00
|
|
|
// After big footer changes
|
|
|
|
6U,
|
2021-12-10 16:12:09 +00:00
|
|
|
// In case any interesting future changes
|
|
|
|
kDefaultFormatVersion,
|
|
|
|
kLatestFormatVersion,
|
|
|
|
};
|
2018-06-05 02:59:44 +00:00
|
|
|
|
2015-10-13 21:24:45 +00:00
|
|
|
std::string RandomKey(Random* rnd, int len, RandomKeyType type) {
|
2011-03-18 22:37:00 +00:00
|
|
|
// Make sure to generate a wide variety of characters so we
|
|
|
|
// test the boundary conditions for short-key optimizations.
|
2018-04-13 00:55:14 +00:00
|
|
|
static const char kTestChars[] = {'\0', '\1', 'a', 'b', 'c',
|
|
|
|
'd', 'e', '\xfd', '\xfe', '\xff'};
|
2011-03-18 22:37:00 +00:00
|
|
|
std::string result;
|
|
|
|
for (int i = 0; i < len; i++) {
|
2015-10-13 21:24:45 +00:00
|
|
|
std::size_t indx = 0;
|
|
|
|
switch (type) {
|
|
|
|
case RandomKeyType::RANDOM:
|
|
|
|
indx = rnd->Uniform(sizeof(kTestChars));
|
|
|
|
break;
|
|
|
|
case RandomKeyType::LARGEST:
|
|
|
|
indx = sizeof(kTestChars) - 1;
|
|
|
|
break;
|
|
|
|
case RandomKeyType::MIDDLE:
|
|
|
|
indx = sizeof(kTestChars) / 2;
|
|
|
|
break;
|
|
|
|
case RandomKeyType::SMALLEST:
|
|
|
|
indx = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
result += kTestChars[indx];
|
2011-03-18 22:37:00 +00:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2023-05-25 22:41:32 +00:00
|
|
|
const std::vector<UserDefinedTimestampTestMode>& GetUDTTestModes() {
|
|
|
|
static std::vector<UserDefinedTimestampTestMode> udt_test_modes = {
|
|
|
|
UserDefinedTimestampTestMode::kStripUserDefinedTimestamp,
|
|
|
|
UserDefinedTimestampTestMode::kNormal,
|
|
|
|
UserDefinedTimestampTestMode::kNone};
|
|
|
|
return udt_test_modes;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool IsUDTEnabled(const UserDefinedTimestampTestMode& test_mode) {
|
|
|
|
return test_mode != UserDefinedTimestampTestMode::kNone;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode) {
|
|
|
|
return test_mode != UserDefinedTimestampTestMode::kStripUserDefinedTimestamp;
|
|
|
|
}
|
|
|
|
|
2011-03-18 22:37:00 +00:00
|
|
|
extern Slice CompressibleString(Random* rnd, double compressed_fraction,
|
|
|
|
int len, std::string* dst) {
|
|
|
|
int raw = static_cast<int>(len * compressed_fraction);
|
2023-12-01 19:15:17 +00:00
|
|
|
if (raw < 1) {
|
|
|
|
raw = 1;
|
|
|
|
}
|
2023-04-22 19:41:36 +00:00
|
|
|
std::string raw_data = rnd->RandomBinaryString(raw);
|
2011-03-18 22:37:00 +00:00
|
|
|
|
|
|
|
// Duplicate the random data until we have filled "len" bytes
|
|
|
|
dst->clear();
|
2012-11-06 20:02:18 +00:00
|
|
|
while (dst->size() < (unsigned int)len) {
|
2011-03-18 22:37:00 +00:00
|
|
|
dst->append(raw_data);
|
|
|
|
}
|
|
|
|
dst->resize(len);
|
|
|
|
return Slice(*dst);
|
|
|
|
}
|
|
|
|
|
2014-08-27 17:39:31 +00:00
|
|
|
namespace {
|
|
|
|
class Uint64ComparatorImpl : public Comparator {
|
|
|
|
public:
|
2023-12-01 19:15:17 +00:00
|
|
|
Uint64ComparatorImpl() = default;
|
2014-08-27 17:39:31 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
const char* Name() const override { return "rocksdb.Uint64Comparator"; }
|
2014-08-27 17:39:31 +00:00
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
int Compare(const Slice& a, const Slice& b) const override {
|
2014-08-27 17:39:31 +00:00
|
|
|
assert(a.size() == sizeof(uint64_t) && b.size() == sizeof(uint64_t));
|
|
|
|
const uint64_t* left = reinterpret_cast<const uint64_t*>(a.data());
|
|
|
|
const uint64_t* right = reinterpret_cast<const uint64_t*>(b.data());
|
2017-04-22 03:41:37 +00:00
|
|
|
uint64_t leftValue;
|
|
|
|
uint64_t rightValue;
|
|
|
|
GetUnaligned(left, &leftValue);
|
|
|
|
GetUnaligned(right, &rightValue);
|
|
|
|
if (leftValue == rightValue) {
|
2014-08-27 17:39:31 +00:00
|
|
|
return 0;
|
2017-04-22 03:41:37 +00:00
|
|
|
} else if (leftValue < rightValue) {
|
2014-08-27 17:39:31 +00:00
|
|
|
return -1;
|
|
|
|
} else {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-14 21:52:47 +00:00
|
|
|
void FindShortestSeparator(std::string* /*start*/,
|
2023-12-01 19:15:17 +00:00
|
|
|
const Slice& /*limit*/) const override {}
|
2014-08-27 17:39:31 +00:00
|
|
|
|
2023-12-01 19:15:17 +00:00
|
|
|
void FindShortSuccessor(std::string* /*key*/) const override {}
|
2014-08-27 17:39:31 +00:00
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
const Comparator* Uint64Comparator() {
|
2018-06-05 02:59:44 +00:00
|
|
|
static Uint64ComparatorImpl uint64comp;
|
|
|
|
return &uint64comp;
|
2014-08-27 17:39:31 +00:00
|
|
|
}
|
|
|
|
|
2022-02-08 20:14:25 +00:00
|
|
|
const Comparator* BytewiseComparatorWithU64TsWrapper() {
|
|
|
|
ConfigOptions config_options;
|
|
|
|
const Comparator* user_comparator = nullptr;
|
|
|
|
Status s = Comparator::CreateFromString(
|
|
|
|
config_options, "leveldb.BytewiseComparator.u64ts", &user_comparator);
|
|
|
|
s.PermitUncheckedError();
|
|
|
|
return user_comparator;
|
2020-04-10 23:03:33 +00:00
|
|
|
}
|
|
|
|
|
2023-07-27 22:31:22 +00:00
|
|
|
const Comparator* ReverseBytewiseComparatorWithU64TsWrapper() {
|
|
|
|
ConfigOptions config_options;
|
|
|
|
const Comparator* user_comparator = nullptr;
|
|
|
|
Status s = Comparator::CreateFromString(
|
|
|
|
config_options, "rocksdb.ReverseBytewiseComparator.u64ts",
|
|
|
|
&user_comparator);
|
|
|
|
s.PermitUncheckedError();
|
|
|
|
return user_comparator;
|
|
|
|
}
|
|
|
|
|
Simplify querying of merge results
Summary:
While working on supporting mixing merge operators with
single deletes ( https://reviews.facebook.net/D43179 ),
I realized that returning and dealing with merge results
can be made simpler. Submitting this as a separate diff
because it is not directly related to single deletes.
Before, callers of merge helper had to retrieve the merge
result in one of two ways depending on whether the merge
was successful or not (success = result of merge was single
kTypeValue). For successful merges, the caller could query
the resulting key/value pair and for unsuccessful merges,
the result could be retrieved in the form of two deques of
keys and values. However, with single deletes, a successful merge
does not return a single key/value pair (if merge
operands are merged with a single delete, we have to generate
a value and keep the original single delete around to make
sure that we are not accidentially producing a key overwrite).
In addition, the two existing call sites of the merge
helper were taking the same actions independently from whether
the merge was successful or not, so this patch simplifies that.
Test Plan: make clean all check
Reviewers: rven, sdong, yhchiang, anthony, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43353
2015-08-18 00:34:38 +00:00
|
|
|
void CorruptKeyType(InternalKey* ikey) {
|
|
|
|
std::string keystr = ikey->Encode().ToString();
|
|
|
|
keystr[keystr.size() - 8] = kTypeLogData;
|
|
|
|
ikey->DecodeFrom(Slice(keystr.data(), keystr.size()));
|
|
|
|
}
|
|
|
|
|
Support for SingleDelete()
Summary:
This patch fixes #7460559. It introduces SingleDelete as a new database
operation. This operation can be used to delete keys that were never
overwritten (no put following another put of the same key). If an overwritten
key is single deleted the behavior is undefined. Single deletion of a
non-existent key has no effect but multiple consecutive single deletions are
not allowed (see limitations).
In contrast to the conventional Delete() operation, the deletion entry is
removed along with the value when the two are lined up in a compaction. Note:
The semantics are similar to @igor's prototype that allowed to have this
behavior on the granularity of a column family (
https://reviews.facebook.net/D42093 ). This new patch, however, is more
aggressive when it comes to removing tombstones: It removes the SingleDelete
together with the value whenever there is no snapshot between them while the
older patch only did this when the sequence number of the deletion was older
than the earliest snapshot.
Most of the complex additions are in the Compaction Iterator, all other changes
should be relatively straightforward. The patch also includes basic support for
single deletions in db_stress and db_bench.
Limitations:
- Not compatible with cuckoo hash tables
- Single deletions cannot be used in combination with merges and normal
deletions on the same key (other keys are not affected by this)
- Consecutive single deletions are currently not allowed (and older version of
this patch supported this so it could be resurrected if needed)
Test Plan: make all check
Reviewers: yhchiang, sdong, rven, anthony, yoshinorim, igor
Reviewed By: igor
Subscribers: maykov, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D43179
2015-09-17 18:42:56 +00:00
|
|
|
std::string KeyStr(const std::string& user_key, const SequenceNumber& seq,
|
|
|
|
const ValueType& t, bool corrupt) {
|
|
|
|
InternalKey k(user_key, seq, t);
|
|
|
|
if (corrupt) {
|
|
|
|
CorruptKeyType(&k);
|
|
|
|
}
|
|
|
|
return k.Encode().ToString();
|
|
|
|
}
|
|
|
|
|
Allow compaction iterator to perform garbage collection (#7556)
Summary:
Add a threshold timestamp, full_history_ts_low_ of type `std::string*` to
`CompactionIterator`, so that RocksDB can also perform garbage collection during
compaction.
* If full_history_ts_low_ is nullptr, then compaction iterator does not perform
GC, preserving all timestamp history for all keys. Compaction iterator will
treat user key with different timestamps as different user keys.
* If full_history_ts_low_ is not nullptr, then compaction iterator performs
GC. GC will look at keys older than `*full_history_ts_low_` and determine their
eligibility based on factors including snapshots.
Current rules of GC:
* If an internal key is in the same snapshot as a previous counterpart
with the same user key, and this key is eligible for GC, and the key is
not single-delete or merge operand, then this key can be dropped. Note
that the previous internal key cannot be a merge operand either.
* If a tombstone is the most recent one in the earliest snapshot and it
is eligible for GC, and keyNotExistsBeyondLevel() is true, then this
tombstone can be dropped.
* If a tombstone is the most recent one in a snapshot and it is eligible
for GC, and the compaction is at bottommost level, then all other older
internal keys of the same user key must also be eligible for GC, thus
can be dropped
* Single-delete, delete-range and merge are not currently supported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7556
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D24507728
Pulled By: riversand963
fbshipit-source-id: 3c09c7301f41eed76dfcf4d1527e68cf6e0a8bb3
2020-10-24 05:58:05 +00:00
|
|
|
std::string KeyStr(uint64_t ts, const std::string& user_key,
|
|
|
|
const SequenceNumber& seq, const ValueType& t,
|
|
|
|
bool corrupt) {
|
|
|
|
std::string user_key_with_ts(user_key);
|
|
|
|
std::string ts_str;
|
|
|
|
PutFixed64(&ts_str, ts);
|
|
|
|
user_key_with_ts.append(ts_str);
|
|
|
|
return KeyStr(user_key_with_ts, seq, t, corrupt);
|
|
|
|
}
|
|
|
|
|
2021-01-26 06:07:26 +00:00
|
|
|
bool SleepingBackgroundTask::TimedWaitUntilSleeping(uint64_t wait_time) {
|
|
|
|
auto abs_time = SystemClock::Default()->NowMicros() + wait_time;
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
while (!sleeping_ || !should_sleep_) {
|
|
|
|
if (bg_cv_.TimedWait(abs_time)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SleepingBackgroundTask::TimedWaitUntilDone(uint64_t wait_time) {
|
|
|
|
auto abs_time = SystemClock::Default()->NowMicros() + wait_time;
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
while (!done_with_sleep_) {
|
|
|
|
if (bg_cv_.TimedWait(abs_time)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
std::string RandomName(Random* rnd, const size_t len) {
|
|
|
|
std::stringstream ss;
|
|
|
|
for (size_t i = 0; i < len; ++i) {
|
|
|
|
ss << static_cast<char>(rnd->Uniform(26) + 'a');
|
|
|
|
}
|
|
|
|
return ss.str();
|
|
|
|
}
|
|
|
|
|
|
|
|
CompressionType RandomCompressionType(Random* rnd) {
|
2019-06-04 02:47:02 +00:00
|
|
|
auto ret = static_cast<CompressionType>(rnd->Uniform(6));
|
|
|
|
while (!CompressionTypeSupported(ret)) {
|
|
|
|
ret = static_cast<CompressionType>((static_cast<int>(ret) + 1) % 6);
|
|
|
|
}
|
|
|
|
return ret;
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void RandomCompressionTypeVector(const size_t count,
|
|
|
|
std::vector<CompressionType>* types,
|
|
|
|
Random* rnd) {
|
|
|
|
types->clear();
|
|
|
|
for (size_t i = 0; i < count; ++i) {
|
|
|
|
types->emplace_back(RandomCompressionType(rnd));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined) {
|
|
|
|
int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4);
|
|
|
|
switch (random_num) {
|
|
|
|
case 0:
|
|
|
|
return NewFixedPrefixTransform(rnd->Uniform(20) + 1);
|
|
|
|
case 1:
|
|
|
|
return NewCappedPrefixTransform(rnd->Uniform(20) + 1);
|
|
|
|
case 2:
|
|
|
|
return NewNoopTransform();
|
|
|
|
default:
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) {
|
|
|
|
BlockBasedTableOptions opt;
|
|
|
|
opt.cache_index_and_filter_blocks = rnd->Uniform(2);
|
Adding pin_l0_filter_and_index_blocks_in_cache feature and related fixes.
Summary:
When a block based table file is opened, if prefetch_index_and_filter is true, it will prefetch the index and filter blocks, putting them into the block cache.
What this feature adds: when a L0 block based table file is opened, if pin_l0_filter_and_index_blocks_in_cache is true in the options (and prefetch_index_and_filter is true), then the filter and index blocks aren't released back to the block cache at the end of BlockBasedTableReader::Open(). Instead the table reader takes ownership of them, hence pinning them, ie. the LRU cache will never push them out. Meanwhile in the table reader, further accesses will not hit the block cache, thus avoiding lock contention.
Test Plan:
'export TEST_TMPDIR=/dev/shm/ && DISABLE_JEMALLOC=1 OPT=-g make all valgrind_check -j32' is OK.
I didn't run the Java tests, I don't have Java set up on my devserver.
Reviewers: sdong
Reviewed By: sdong
Subscribers: andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D56133
2016-04-01 17:42:39 +00:00
|
|
|
opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2);
|
2018-06-22 22:14:05 +00:00
|
|
|
opt.pin_top_level_index_and_filter = rnd->Uniform(2);
|
Add an option to put first key of each sst block in the index (#5289)
Summary:
The first key is used to defer reading the data block until this file gets to the top of merging iterator's heap. For short range scans, most files never make it to the top of the heap, so this change can reduce read amplification by a lot sometimes.
Consider the following workload. There are a few data streams (we'll be calling them "logs"), each stream consisting of a sequence of blobs (we'll be calling them "records"). Each record is identified by log ID and a sequence number within the log. RocksDB key is concatenation of log ID and sequence number (big endian). Reads are mostly relatively short range scans, each within a single log. Writes are mostly sequential for each log, but writes to different logs are randomly interleaved. Compactions are disabled; instead, when we accumulate a few tens of sst files, we create a new column family and start writing to it.
So, a typical sst file consists of a few ranges of blocks, each range corresponding to one log ID (we use FlushBlockPolicy to cut blocks at log boundaries). A typical read would go like this. First, iterator Seek() reads one block from each sst file. Then a series of Next()s move through one sst file (since writes to each log are mostly sequential) until the subiterator reaches the end of this log in this sst file; then Next() switches to the next sst file and reads sequentially from that, and so on. Often a range scan will only return records from a small number of blocks in small number of sst files; in this case, the cost of initial Seek() reading one block from each file may be bigger than the cost of reading the actually useful blocks.
Neither iterate_upper_bound nor bloom filters can prevent reading one block from each file in Seek(). But this PR can: if the index contains first key from each block, we don't have to read the block until this block actually makes it to the top of merging iterator's heap, so for short range scans we won't read any blocks from most of the sst files.
This PR does the deferred block loading inside value() call. This is not ideal: there's no good way to report an IO error from inside value(). As discussed with siying offline, it would probably be better to change InternalIterator's interface to explicitly fetch deferred value and get status. I'll do it in a separate PR.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5289
Differential Revision: D15256423
Pulled By: al13n321
fbshipit-source-id: 750e4c39ce88e8d41662f701cf6275d9388ba46a
2019-06-25 03:50:35 +00:00
|
|
|
using IndexType = BlockBasedTableOptions::IndexType;
|
|
|
|
const std::array<IndexType, 4> index_types = {
|
|
|
|
{IndexType::kBinarySearch, IndexType::kHashSearch,
|
|
|
|
IndexType::kTwoLevelIndexSearch, IndexType::kBinarySearchWithFirstKey}};
|
|
|
|
opt.index_type =
|
|
|
|
index_types[rnd->Uniform(static_cast<int>(index_types.size()))];
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
opt.checksum = static_cast<ChecksumType>(rnd->Uniform(3));
|
|
|
|
opt.block_size = rnd->Uniform(10000000);
|
|
|
|
opt.block_size_deviation = rnd->Uniform(100);
|
|
|
|
opt.block_restart_interval = rnd->Uniform(100);
|
2016-02-05 18:22:37 +00:00
|
|
|
opt.index_block_restart_interval = rnd->Uniform(100);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
opt.whole_key_filtering = rnd->Uniform(2);
|
|
|
|
|
|
|
|
return opt;
|
|
|
|
}
|
|
|
|
|
|
|
|
TableFactory* RandomTableFactory(Random* rnd, int pre_defined) {
|
|
|
|
int random_num = pre_defined >= 0 ? pre_defined : rnd->Uniform(4);
|
|
|
|
switch (random_num) {
|
|
|
|
case 0:
|
|
|
|
return NewPlainTableFactory();
|
|
|
|
case 1:
|
|
|
|
return NewCuckooTableFactory();
|
|
|
|
default:
|
|
|
|
return NewBlockBasedTableFactory();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
MergeOperator* RandomMergeOperator(Random* rnd) {
|
|
|
|
return new ChanglingMergeOperator(RandomName(rnd, 10));
|
|
|
|
}
|
|
|
|
|
|
|
|
CompactionFilter* RandomCompactionFilter(Random* rnd) {
|
|
|
|
return new ChanglingCompactionFilter(RandomName(rnd, 10));
|
|
|
|
}
|
|
|
|
|
|
|
|
CompactionFilterFactory* RandomCompactionFilterFactory(Random* rnd) {
|
|
|
|
return new ChanglingCompactionFilterFactory(RandomName(rnd, 10));
|
|
|
|
}
|
|
|
|
|
|
|
|
void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
|
|
|
|
// boolean options
|
|
|
|
db_opt->advise_random_on_open = rnd->Uniform(2);
|
|
|
|
db_opt->allow_mmap_reads = rnd->Uniform(2);
|
|
|
|
db_opt->allow_mmap_writes = rnd->Uniform(2);
|
2016-12-22 20:51:29 +00:00
|
|
|
db_opt->use_direct_reads = rnd->Uniform(2);
|
2017-04-13 20:07:33 +00:00
|
|
|
db_opt->use_direct_io_for_flush_and_compaction = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
db_opt->create_if_missing = rnd->Uniform(2);
|
|
|
|
db_opt->create_missing_column_families = rnd->Uniform(2);
|
|
|
|
db_opt->enable_thread_tracking = rnd->Uniform(2);
|
|
|
|
db_opt->error_if_exists = rnd->Uniform(2);
|
|
|
|
db_opt->is_fd_close_on_exec = rnd->Uniform(2);
|
|
|
|
db_opt->paranoid_checks = rnd->Uniform(2);
|
2020-10-09 23:40:25 +00:00
|
|
|
db_opt->track_and_verify_wals_in_manifest = rnd->Uniform(2);
|
2022-05-19 18:04:21 +00:00
|
|
|
db_opt->verify_sst_unique_id_in_manifest = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
|
Add an option to prevent DB::Open() from querying sizes of all sst files (#6353)
Summary:
When paranoid_checks is on, DBImpl::CheckConsistency() iterates over all sst files and calls Env::GetFileSize() for each of them. As far as I could understand, this is pretty arbitrary and doesn't affect correctness - if filesystem doesn't corrupt fsynced files, the file sizes will always match; if it does, it may as well corrupt contents as well as sizes, and rocksdb doesn't check contents on open.
If there are thousands of sst files, getting all their sizes takes a while. If, on top of that, Env is overridden to use some remote storage instead of local filesystem, it can be *really* slow and overload the remote storage service. This PR adds an option to not do GetFileSize(); instead it does GetChildren() for parent directory to check that all the expected sst files are at least present, but doesn't check their sizes.
We can't just disable paranoid_checks instead because paranoid_checks do a few other important things: make the DB read-only on write errors, print error messages on read errors, etc.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6353
Test Plan: ran the added sanity check unit test. Will try it out in a LogDevice test cluster where the GetFileSize() calls are causing a lot of trouble.
Differential Revision: D19656425
Pulled By: al13n321
fbshipit-source-id: c2c421b367633033760d1f56747bad206d1fbf82
2020-02-04 09:24:29 +00:00
|
|
|
db_opt->skip_checking_sst_file_sizes_on_db_open = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
db_opt->use_adaptive_mutex = rnd->Uniform(2);
|
|
|
|
db_opt->use_fsync = rnd->Uniform(2);
|
|
|
|
db_opt->recycle_log_file_num = rnd->Uniform(2);
|
2016-11-02 22:22:13 +00:00
|
|
|
db_opt->avoid_flush_during_recovery = rnd->Uniform(2);
|
|
|
|
db_opt->avoid_flush_during_shutdown = rnd->Uniform(2);
|
2022-05-20 23:48:50 +00:00
|
|
|
db_opt->enforce_single_del_contracts = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
|
|
|
|
// int options
|
|
|
|
db_opt->max_background_compactions = rnd->Uniform(100);
|
|
|
|
db_opt->max_background_flushes = rnd->Uniform(100);
|
|
|
|
db_opt->max_file_opening_threads = rnd->Uniform(100);
|
|
|
|
db_opt->max_open_files = rnd->Uniform(100);
|
|
|
|
db_opt->table_cache_numshardbits = rnd->Uniform(100);
|
|
|
|
|
|
|
|
// size_t options
|
|
|
|
db_opt->db_write_buffer_size = rnd->Uniform(10000);
|
|
|
|
db_opt->keep_log_file_num = rnd->Uniform(10000);
|
|
|
|
db_opt->log_file_time_to_roll = rnd->Uniform(10000);
|
|
|
|
db_opt->manifest_preallocation_size = rnd->Uniform(10000);
|
|
|
|
db_opt->max_log_file_size = rnd->Uniform(10000);
|
|
|
|
|
|
|
|
// std::string options
|
|
|
|
db_opt->db_log_dir = "path/to/db_log_dir";
|
|
|
|
db_opt->wal_dir = "path/to/wal_dir";
|
|
|
|
|
|
|
|
// uint32_t options
|
|
|
|
db_opt->max_subcompactions = rnd->Uniform(100000);
|
|
|
|
|
|
|
|
// uint64_t options
|
|
|
|
static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX);
|
|
|
|
db_opt->WAL_size_limit_MB = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->WAL_ttl_seconds = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->bytes_per_sync = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->delayed_write_rate = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->delete_obsolete_files_period_micros = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->max_manifest_file_size = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->max_total_wal_size = uint_max + rnd->Uniform(100000);
|
|
|
|
db_opt->wal_bytes_per_sync = uint_max + rnd->Uniform(100000);
|
|
|
|
|
|
|
|
// unsigned int options
|
|
|
|
db_opt->stats_dump_period_sec = rnd->Uniform(100000);
|
|
|
|
}
|
|
|
|
|
2019-06-04 02:47:02 +00:00
|
|
|
void RandomInitCFOptions(ColumnFamilyOptions* cf_opt, DBOptions& db_options,
|
|
|
|
Random* rnd) {
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
cf_opt->compaction_style = (CompactionStyle)(rnd->Uniform(4));
|
|
|
|
|
|
|
|
// boolean options
|
2016-04-14 20:56:29 +00:00
|
|
|
cf_opt->report_bg_io_stats = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
cf_opt->disable_auto_compactions = rnd->Uniform(2);
|
|
|
|
cf_opt->inplace_update_support = rnd->Uniform(2);
|
|
|
|
cf_opt->level_compaction_dynamic_level_bytes = rnd->Uniform(2);
|
|
|
|
cf_opt->optimize_filters_for_hits = rnd->Uniform(2);
|
|
|
|
cf_opt->paranoid_file_checks = rnd->Uniform(2);
|
2016-10-08 00:21:45 +00:00
|
|
|
cf_opt->force_consistency_checks = rnd->Uniform(2);
|
2017-10-19 22:19:20 +00:00
|
|
|
cf_opt->compaction_options_fifo.allow_compaction = rnd->Uniform(2);
|
2019-02-19 20:12:25 +00:00
|
|
|
cf_opt->memtable_whole_key_filtering = rnd->Uniform(2);
|
2020-08-19 01:31:31 +00:00
|
|
|
cf_opt->enable_blob_files = rnd->Uniform(2);
|
2020-11-13 02:57:20 +00:00
|
|
|
cf_opt->enable_blob_garbage_collection = rnd->Uniform(2);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
|
|
|
|
// double options
|
2016-06-04 00:02:10 +00:00
|
|
|
cf_opt->memtable_prefix_bloom_size_ratio =
|
|
|
|
static_cast<double>(rnd->Uniform(10000)) / 20000.0;
|
2020-11-13 02:57:20 +00:00
|
|
|
cf_opt->blob_garbage_collection_age_cutoff = rnd->Uniform(10000) / 10000.0;
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
cf_opt->blob_garbage_collection_force_threshold =
|
|
|
|
rnd->Uniform(10000) / 10000.0;
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
|
|
|
|
// int options
|
|
|
|
cf_opt->level0_file_num_compaction_trigger = rnd->Uniform(100);
|
|
|
|
cf_opt->level0_slowdown_writes_trigger = rnd->Uniform(100);
|
|
|
|
cf_opt->level0_stop_writes_trigger = rnd->Uniform(100);
|
|
|
|
cf_opt->max_bytes_for_level_multiplier = rnd->Uniform(100);
|
|
|
|
cf_opt->max_write_buffer_number = rnd->Uniform(100);
|
|
|
|
cf_opt->max_write_buffer_number_to_maintain = rnd->Uniform(100);
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
2019-08-23 20:54:09 +00:00
|
|
|
cf_opt->max_write_buffer_size_to_maintain = rnd->Uniform(10000);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
cf_opt->min_write_buffer_number_to_merge = rnd->Uniform(100);
|
|
|
|
cf_opt->num_levels = rnd->Uniform(100);
|
|
|
|
cf_opt->target_file_size_multiplier = rnd->Uniform(100);
|
|
|
|
|
2016-09-14 04:12:43 +00:00
|
|
|
// vector int options
|
|
|
|
cf_opt->max_bytes_for_level_multiplier_additional.resize(cf_opt->num_levels);
|
|
|
|
for (int i = 0; i < cf_opt->num_levels; i++) {
|
|
|
|
cf_opt->max_bytes_for_level_multiplier_additional[i] = rnd->Uniform(100);
|
|
|
|
}
|
|
|
|
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
// size_t options
|
|
|
|
cf_opt->arena_block_size = rnd->Uniform(10000);
|
|
|
|
cf_opt->inplace_update_num_locks = rnd->Uniform(10000);
|
2020-01-11 00:51:34 +00:00
|
|
|
cf_opt->max_successive_merges = rnd->Uniform(10000);
|
2016-07-27 01:05:30 +00:00
|
|
|
cf_opt->memtable_huge_page_size = rnd->Uniform(10000);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
cf_opt->write_buffer_size = rnd->Uniform(10000);
|
|
|
|
|
|
|
|
// uint32_t options
|
|
|
|
cf_opt->bloom_locality = rnd->Uniform(10000);
|
|
|
|
cf_opt->max_bytes_for_level_base = rnd->Uniform(10000);
|
|
|
|
|
|
|
|
// uint64_t options
|
|
|
|
static const uint64_t uint_max = static_cast<uint64_t>(UINT_MAX);
|
2019-06-04 02:47:02 +00:00
|
|
|
cf_opt->ttl =
|
|
|
|
db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0;
|
|
|
|
cf_opt->periodic_compaction_seconds =
|
|
|
|
db_options.max_open_files == -1 ? uint_max + rnd->Uniform(10000) : 0;
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
cf_opt->max_sequential_skip_in_iterations = uint_max + rnd->Uniform(10000);
|
|
|
|
cf_opt->target_file_size_base = uint_max + rnd->Uniform(10000);
|
2016-06-16 23:02:52 +00:00
|
|
|
cf_opt->max_compaction_bytes =
|
|
|
|
cf_opt->target_file_size_base * rnd->Uniform(100);
|
2017-10-19 22:19:20 +00:00
|
|
|
cf_opt->compaction_options_fifo.max_table_files_size =
|
|
|
|
uint_max + rnd->Uniform(10000);
|
2020-08-19 01:31:31 +00:00
|
|
|
cf_opt->min_blob_size = uint_max + rnd->Uniform(10000);
|
|
|
|
cf_opt->blob_file_size = uint_max + rnd->Uniform(10000);
|
2021-11-20 01:52:42 +00:00
|
|
|
cf_opt->blob_compaction_readahead_size = uint_max + rnd->Uniform(10000);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
|
|
|
|
// pointer typed options
|
|
|
|
cf_opt->prefix_extractor.reset(RandomSliceTransform(rnd));
|
|
|
|
cf_opt->table_factory.reset(RandomTableFactory(rnd));
|
|
|
|
cf_opt->merge_operator.reset(RandomMergeOperator(rnd));
|
|
|
|
if (cf_opt->compaction_filter) {
|
|
|
|
delete cf_opt->compaction_filter;
|
|
|
|
}
|
|
|
|
cf_opt->compaction_filter = RandomCompactionFilter(rnd);
|
|
|
|
cf_opt->compaction_filter_factory.reset(RandomCompactionFilterFactory(rnd));
|
|
|
|
|
|
|
|
// custom typed options
|
|
|
|
cf_opt->compression = RandomCompressionType(rnd);
|
|
|
|
RandomCompressionTypeVector(cf_opt->num_levels,
|
|
|
|
&cf_opt->compression_per_level, rnd);
|
2020-08-19 01:31:31 +00:00
|
|
|
cf_opt->blob_compression_type = RandomCompressionType(rnd);
|
Add OptionsUtil::LoadOptionsFromFile() API
Summary:
This patch adds OptionsUtil::LoadOptionsFromFile() and
OptionsUtil::LoadLatestOptionsFromDB(), which allow developers
to construct DBOptions and ColumnFamilyOptions from a RocksDB
options file. Note that most pointer-typed options such as
merge_operator will not be constructed.
With this API, developers no longer need to remember all the
options in order to reopen an existing rocksdb instance like
the following:
DBOptions db_options;
std::vector<std::string> cf_names;
std::vector<ColumnFamilyOptions> cf_opts;
// Load primitive-typed options from an existing DB
OptionsUtil::LoadLatestOptionsFromDB(
dbname, &db_options, &cf_names, &cf_opts);
// Initialize necessary pointer-typed options
cf_opts[0].merge_operator.reset(new MyMergeOperator());
...
// Construct the vector of ColumnFamilyDescriptor
std::vector<ColumnFamilyDescriptor> cf_descs;
for (size_t i = 0; i < cf_opts.size(); ++i) {
cf_descs.emplace_back(cf_names[i], cf_opts[i]);
}
// Open the DB
DB* db = nullptr;
std::vector<ColumnFamilyHandle*> cf_handles;
auto s = DB::Open(db_options, dbname, cf_descs,
&handles, &db);
Test Plan:
Augment existing tests in column_family_test
options_test
db_test
Reviewers: igor, IslamAbdelRahman, sdong, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D49095
2015-11-12 14:52:43 +00:00
|
|
|
}
|
|
|
|
|
2018-11-13 19:14:41 +00:00
|
|
|
bool IsDirectIOSupported(Env* env, const std::string& dir) {
|
|
|
|
EnvOptions env_options;
|
|
|
|
env_options.use_mmap_writes = false;
|
|
|
|
env_options.use_direct_writes = true;
|
|
|
|
std::string tmp = TempFileName(dir, 999);
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
std::unique_ptr<WritableFile> file;
|
|
|
|
s = env->NewWritableFile(tmp, &file, env_options);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = env->DeleteFile(tmp);
|
|
|
|
}
|
|
|
|
return s.ok();
|
|
|
|
}
|
|
|
|
|
2021-01-06 18:48:24 +00:00
|
|
|
bool IsPrefetchSupported(const std::shared_ptr<FileSystem>& fs,
|
|
|
|
const std::string& dir) {
|
|
|
|
bool supported = false;
|
|
|
|
std::string tmp = TempFileName(dir, 999);
|
|
|
|
Random rnd(301);
|
|
|
|
std::string test_string = rnd.RandomString(4096);
|
|
|
|
Slice data(test_string);
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
IOOptions opts;
|
|
|
|
Status s = WriteStringToFile(fs.get(), data, tmp, true, opts);
|
2021-01-06 18:48:24 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
std::unique_ptr<FSRandomAccessFile> file;
|
|
|
|
auto io_s = fs->NewRandomAccessFile(tmp, FileOptions(), &file, nullptr);
|
|
|
|
if (io_s.ok()) {
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
supported =
|
|
|
|
!(file->Prefetch(0, data.size(), opts, nullptr).IsNotSupported());
|
2021-01-06 18:48:24 +00:00
|
|
|
}
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = fs->DeleteFile(tmp, opts, nullptr);
|
2021-01-06 18:48:24 +00:00
|
|
|
}
|
|
|
|
return s.ok() && supported;
|
|
|
|
}
|
|
|
|
|
2019-07-09 21:48:07 +00:00
|
|
|
size_t GetLinesCount(const std::string& fname, const std::string& pattern) {
|
|
|
|
std::stringstream ssbuf;
|
|
|
|
std::string line;
|
|
|
|
size_t count = 0;
|
|
|
|
|
|
|
|
std::ifstream inFile(fname.c_str());
|
|
|
|
ssbuf << inFile.rdbuf();
|
|
|
|
|
|
|
|
while (getline(ssbuf, line)) {
|
|
|
|
if (line.find(pattern) != std::string::npos) {
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
Status CorruptFile(Env* env, const std::string& fname, int offset,
|
|
|
|
int bytes_to_corrupt, bool verify_checksum /*=true*/) {
|
|
|
|
uint64_t size;
|
|
|
|
Status s = env->GetFileSize(fname, &size);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
} else if (offset < 0) {
|
2020-06-05 18:06:26 +00:00
|
|
|
// Relative to end of file; make it absolute
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
if (-offset > static_cast<int>(size)) {
|
2020-06-05 18:06:26 +00:00
|
|
|
offset = 0;
|
|
|
|
} else {
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
offset = static_cast<int>(size + offset);
|
2020-06-05 18:06:26 +00:00
|
|
|
}
|
|
|
|
}
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
if (offset > static_cast<int>(size)) {
|
|
|
|
offset = static_cast<int>(size);
|
2020-06-05 18:06:26 +00:00
|
|
|
}
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
if (offset + bytes_to_corrupt > static_cast<int>(size)) {
|
|
|
|
bytes_to_corrupt = static_cast<int>(size - offset);
|
2020-06-05 18:06:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Do it
|
|
|
|
std::string contents;
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
s = ReadFileToString(env, fname, &contents);
|
|
|
|
if (s.ok()) {
|
|
|
|
for (int i = 0; i < bytes_to_corrupt; i++) {
|
|
|
|
contents[i + offset] ^= 0x80;
|
|
|
|
}
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = WriteStringToFile(env, contents, fname, false /* should_sync */);
|
2020-06-05 18:06:26 +00:00
|
|
|
}
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
if (s.ok() && verify_checksum) {
|
|
|
|
Options options;
|
|
|
|
options.env = env;
|
|
|
|
EnvOptions env_options;
|
|
|
|
Status v = VerifySstFileChecksum(options, env_options, fname);
|
|
|
|
assert(!v.ok());
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status TruncateFile(Env* env, const std::string& fname, uint64_t new_length) {
|
|
|
|
uint64_t old_length;
|
|
|
|
Status s = env->GetFileSize(fname, &old_length);
|
|
|
|
if (!s.ok() || new_length == old_length) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
// Do it
|
|
|
|
std::string contents;
|
|
|
|
s = ReadFileToString(env, fname, &contents);
|
|
|
|
if (s.ok()) {
|
|
|
|
contents.resize(static_cast<size_t>(new_length), 'b');
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = WriteStringToFile(env, contents, fname, false /* should_sync */);
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
}
|
|
|
|
return s;
|
2020-06-05 18:06:26 +00:00
|
|
|
}
|
|
|
|
|
2020-12-10 05:19:55 +00:00
|
|
|
// Try and delete a directory if it exists
|
|
|
|
Status TryDeleteDir(Env* env, const std::string& dirname) {
|
|
|
|
bool is_dir = false;
|
|
|
|
Status s = env->IsDirectory(dirname, &is_dir);
|
|
|
|
if (s.ok() && is_dir) {
|
|
|
|
s = env->DeleteDir(dirname);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete a directory if it exists
|
|
|
|
void DeleteDir(Env* env, const std::string& dirname) {
|
|
|
|
TryDeleteDir(env, dirname).PermitUncheckedError();
|
|
|
|
}
|
|
|
|
|
2021-06-15 10:42:52 +00:00
|
|
|
Status CreateEnvFromSystem(const ConfigOptions& config_options, Env** result,
|
|
|
|
std::shared_ptr<Env>* guard) {
|
|
|
|
const char* env_uri = getenv("TEST_ENV_URI");
|
|
|
|
const char* fs_uri = getenv("TEST_FS_URI");
|
|
|
|
if (env_uri || fs_uri) {
|
|
|
|
return Env::CreateFromUri(config_options,
|
|
|
|
(env_uri != nullptr) ? env_uri : "",
|
|
|
|
(fs_uri != nullptr) ? fs_uri : "", result, guard);
|
|
|
|
} else {
|
|
|
|
// Neither specified. Use the default
|
|
|
|
*result = config_options.env;
|
|
|
|
guard->reset();
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
2021-09-08 14:45:59 +00:00
|
|
|
namespace {
|
|
|
|
// A hacky skip list mem table that triggers flush after number of entries.
|
|
|
|
class SpecialMemTableRep : public MemTableRep {
|
|
|
|
public:
|
|
|
|
explicit SpecialMemTableRep(Allocator* allocator, MemTableRep* memtable,
|
|
|
|
int num_entries_flush)
|
|
|
|
: MemTableRep(allocator),
|
|
|
|
memtable_(memtable),
|
|
|
|
num_entries_flush_(num_entries_flush),
|
|
|
|
num_entries_(0) {}
|
|
|
|
|
|
|
|
virtual KeyHandle Allocate(const size_t len, char** buf) override {
|
|
|
|
return memtable_->Allocate(len, buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Insert key into the list.
|
|
|
|
// REQUIRES: nothing that compares equal to key is currently in the list.
|
|
|
|
virtual void Insert(KeyHandle handle) override {
|
|
|
|
num_entries_++;
|
|
|
|
memtable_->Insert(handle);
|
|
|
|
}
|
|
|
|
|
|
|
|
void InsertConcurrently(KeyHandle handle) override {
|
|
|
|
num_entries_++;
|
|
|
|
memtable_->Insert(handle);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true iff an entry that compares equal to key is in the list.
|
|
|
|
virtual bool Contains(const char* key) const override {
|
|
|
|
return memtable_->Contains(key);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual size_t ApproximateMemoryUsage() override {
|
|
|
|
// Return a high memory usage when number of entries exceeds the threshold
|
|
|
|
// to trigger a flush.
|
|
|
|
return (num_entries_ < num_entries_flush_) ? 0 : 1024 * 1024 * 1024;
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual void Get(const LookupKey& k, void* callback_args,
|
|
|
|
bool (*callback_func)(void* arg,
|
|
|
|
const char* entry)) override {
|
|
|
|
memtable_->Get(k, callback_args, callback_func);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ApproximateNumEntries(const Slice& start_ikey,
|
|
|
|
const Slice& end_ikey) override {
|
|
|
|
return memtable_->ApproximateNumEntries(start_ikey, end_ikey);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override {
|
|
|
|
return memtable_->GetIterator(arena);
|
|
|
|
}
|
|
|
|
|
2023-12-01 19:15:17 +00:00
|
|
|
virtual ~SpecialMemTableRep() override = default;
|
2021-09-08 14:45:59 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
std::unique_ptr<MemTableRep> memtable_;
|
|
|
|
int num_entries_flush_;
|
|
|
|
int num_entries_;
|
|
|
|
};
|
|
|
|
class SpecialSkipListFactory : public MemTableRepFactory {
|
|
|
|
public:
|
2021-12-29 15:55:17 +00:00
|
|
|
static bool Register(ObjectLibrary& library, const std::string& /*arg*/) {
|
2022-01-11 14:32:42 +00:00
|
|
|
library.AddFactory<MemTableRepFactory>(
|
2021-12-29 15:55:17 +00:00
|
|
|
ObjectLibrary::PatternEntry(SpecialSkipListFactory::kClassName(), true)
|
|
|
|
.AddNumber(":"),
|
|
|
|
[](const std::string& uri, std::unique_ptr<MemTableRepFactory>* guard,
|
|
|
|
std::string* /* errmsg */) {
|
2023-12-01 19:15:17 +00:00
|
|
|
auto colon = uri.find(':');
|
2021-12-29 15:55:17 +00:00
|
|
|
if (colon != std::string::npos) {
|
|
|
|
auto count = ParseInt(uri.substr(colon + 1));
|
|
|
|
guard->reset(new SpecialSkipListFactory(count));
|
|
|
|
} else {
|
|
|
|
guard->reset(new SpecialSkipListFactory(2));
|
|
|
|
}
|
|
|
|
return guard->get();
|
|
|
|
});
|
|
|
|
return true;
|
|
|
|
}
|
2023-04-14 18:50:20 +00:00
|
|
|
// After number of inserts >= `num_entries_flush` in a mem table, trigger
|
2021-09-08 14:45:59 +00:00
|
|
|
// flush.
|
|
|
|
explicit SpecialSkipListFactory(int num_entries_flush)
|
|
|
|
: num_entries_flush_(num_entries_flush) {}
|
|
|
|
|
|
|
|
using MemTableRepFactory::CreateMemTableRep;
|
|
|
|
virtual MemTableRep* CreateMemTableRep(
|
|
|
|
const MemTableRep::KeyComparator& compare, Allocator* allocator,
|
|
|
|
const SliceTransform* transform, Logger* /*logger*/) override {
|
|
|
|
return new SpecialMemTableRep(
|
|
|
|
allocator,
|
|
|
|
factory_.CreateMemTableRep(compare, allocator, transform, nullptr),
|
|
|
|
num_entries_flush_);
|
|
|
|
}
|
|
|
|
static const char* kClassName() { return "SpecialSkipListFactory"; }
|
|
|
|
virtual const char* Name() const override { return kClassName(); }
|
|
|
|
std::string GetId() const override {
|
|
|
|
std::string id = Name();
|
|
|
|
if (num_entries_flush_ > 0) {
|
2022-05-06 20:03:58 +00:00
|
|
|
id.append(":").append(std::to_string(num_entries_flush_));
|
2021-09-08 14:45:59 +00:00
|
|
|
}
|
|
|
|
return id;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool IsInsertConcurrentlySupported() const override {
|
|
|
|
return factory_.IsInsertConcurrentlySupported();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
SkipListFactory factory_;
|
|
|
|
int num_entries_flush_;
|
|
|
|
};
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush) {
|
|
|
|
RegisterTestLibrary();
|
|
|
|
return new SpecialSkipListFactory(num_entries_per_flush);
|
|
|
|
}
|
2021-06-15 10:42:52 +00:00
|
|
|
|
2021-08-19 17:09:30 +00:00
|
|
|
// This method loads existing test classes into the ObjectRegistry
|
2021-12-29 15:55:17 +00:00
|
|
|
int RegisterTestObjects(ObjectLibrary& library, const std::string& arg) {
|
2021-08-19 17:09:30 +00:00
|
|
|
size_t num_types;
|
2022-01-11 14:32:42 +00:00
|
|
|
library.AddFactory<const Comparator>(
|
2021-08-19 17:09:30 +00:00
|
|
|
test::SimpleSuffixReverseComparator::kClassName(),
|
|
|
|
[](const std::string& /*uri*/,
|
|
|
|
std::unique_ptr<const Comparator>* /*guard*/,
|
|
|
|
std::string* /* errmsg */) {
|
|
|
|
static test::SimpleSuffixReverseComparator ssrc;
|
|
|
|
return &ssrc;
|
|
|
|
});
|
2021-12-29 15:55:17 +00:00
|
|
|
SpecialSkipListFactory::Register(library, arg);
|
2022-01-11 14:32:42 +00:00
|
|
|
library.AddFactory<MergeOperator>(
|
2021-08-19 17:09:30 +00:00
|
|
|
"Changling",
|
|
|
|
[](const std::string& uri, std::unique_ptr<MergeOperator>* guard,
|
|
|
|
std::string* /* errmsg */) {
|
|
|
|
guard->reset(new test::ChanglingMergeOperator(uri));
|
|
|
|
return guard->get();
|
|
|
|
});
|
2022-01-11 14:32:42 +00:00
|
|
|
library.AddFactory<CompactionFilter>(
|
2021-08-19 17:09:30 +00:00
|
|
|
"Changling",
|
|
|
|
[](const std::string& uri, std::unique_ptr<CompactionFilter>* /*guard*/,
|
|
|
|
std::string* /* errmsg */) {
|
|
|
|
return new test::ChanglingCompactionFilter(uri);
|
|
|
|
});
|
2022-01-11 14:32:42 +00:00
|
|
|
library.AddFactory<CompactionFilterFactory>(
|
2021-08-19 17:09:30 +00:00
|
|
|
"Changling", [](const std::string& uri,
|
|
|
|
std::unique_ptr<CompactionFilterFactory>* guard,
|
|
|
|
std::string* /* errmsg */) {
|
|
|
|
guard->reset(new test::ChanglingCompactionFilterFactory(uri));
|
|
|
|
return guard->get();
|
|
|
|
});
|
2022-01-11 14:32:42 +00:00
|
|
|
library.AddFactory<SystemClock>(
|
2021-09-21 15:53:03 +00:00
|
|
|
MockSystemClock::kClassName(),
|
|
|
|
[](const std::string& /*uri*/, std::unique_ptr<SystemClock>* guard,
|
|
|
|
std::string* /* errmsg */) {
|
|
|
|
guard->reset(new MockSystemClock(SystemClock::Default()));
|
|
|
|
return guard->get();
|
|
|
|
});
|
2021-08-19 17:09:30 +00:00
|
|
|
return static_cast<int>(library.GetFactoryCount(&num_types));
|
|
|
|
}
|
2021-09-08 14:45:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
void RegisterTestLibrary(const std::string& arg) {
|
|
|
|
static bool registered = false;
|
|
|
|
if (!registered) {
|
|
|
|
registered = true;
|
|
|
|
ObjectRegistry::Default()->AddLibrary("test", RegisterTestObjects, arg);
|
|
|
|
}
|
|
|
|
}
|
2011-10-31 17:22:06 +00:00
|
|
|
} // namespace test
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|