2016-02-09 23:12:00 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
2017-07-15 23:03:42 +00:00
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
2013-10-16 21:59:46 +00:00
|
|
|
//
|
2011-06-22 18:45:39 +00:00
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/version_set.h"
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
#include <algorithm>
|
|
|
|
|
2024-08-17 00:18:54 +00:00
|
|
|
#include "db/blob/blob_log_writer.h"
|
2019-09-03 15:50:47 +00:00
|
|
|
#include "db/db_impl/db_impl.h"
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
#include "db/db_test_util.h"
|
2018-06-28 19:16:10 +00:00
|
|
|
#include "db/log_writer.h"
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconveniences:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 have file reordering corruption that might cause returning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files to avoid the above inconveniences. Credit to ajkr, we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will have the minimum `epoch_num` among input files'). This will avoid the above inconveniences in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be applied to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there being only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
#include "db/version_edit.h"
|
2021-12-03 22:42:05 +00:00
|
|
|
#include "rocksdb/advanced_options.h"
|
2021-06-15 10:42:52 +00:00
|
|
|
#include "rocksdb/convenience.h"
|
2021-01-29 06:08:46 +00:00
|
|
|
#include "rocksdb/file_system.h"
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
#include "table/block_based/block_based_table_factory.h"
|
2018-06-28 19:16:10 +00:00
|
|
|
#include "table/mock_table.h"
|
2022-05-19 18:04:21 +00:00
|
|
|
#include "table/unique_id_impl.h"
|
2023-10-27 22:56:48 +00:00
|
|
|
#include "test_util/mock_time_env.h"
|
2019-05-30 18:21:38 +00:00
|
|
|
#include "test_util/testharness.h"
|
|
|
|
#include "test_util/testutil.h"
|
2019-05-31 00:39:43 +00:00
|
|
|
#include "util/string_util.h"
|
2011-06-22 18:45:39 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2011-06-22 18:45:39 +00:00
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
class GenerateLevelFilesBriefTest : public testing::Test {
|
2011-06-22 18:45:39 +00:00
|
|
|
public:
|
|
|
|
// Files registered through Add(). The pointers are owned by this fixture:
// each element is deleted in the destructor.
std::vector<FileMetaData*> files_;
|
2014-10-28 17:03:13 +00:00
|
|
|
// Brief representation whose per-file numbers Compare() checks against
// files_.
LevelFilesBrief file_level_;
|
2014-07-11 19:52:41 +00:00
|
|
|
// NOTE(review): not referenced in the visible part of the fixture;
// presumably supplies the storage behind file_level_ -- confirm against the
// tests that use it.
Arena arena_;
|
2011-10-05 23:30:28 +00:00
|
|
|
|
2023-12-04 19:17:32 +00:00
|
|
|
// No custom setup is needed: every member is default-constructed.
GenerateLevelFilesBriefTest() = default;
|
2011-06-22 18:45:39 +00:00
|
|
|
|
2019-02-19 21:36:04 +00:00
|
|
|
// Frees every FileMetaData held in files_; the fixture owns those pointers.
~GenerateLevelFilesBriefTest() override {
  for (FileMetaData* meta : files_) {
    delete meta;
  }
}
|
|
|
|
|
2011-07-15 00:20:57 +00:00
|
|
|
void Add(const char* smallest, const char* largest,
|
|
|
|
SequenceNumber smallest_seq = 100,
|
|
|
|
SequenceNumber largest_seq = 100) {
|
2019-10-14 22:19:31 +00:00
|
|
|
FileMetaData* f = new FileMetaData(
|
|
|
|
files_.size() + 1, 0, 0,
|
|
|
|
InternalKey(smallest, smallest_seq, kTypeValue),
|
|
|
|
InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
|
2021-12-03 22:42:05 +00:00
|
|
|
largest_seq, /* marked_for_compact */ false, Temperature::kUnknown,
|
|
|
|
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum,
|
2023-06-22 04:49:01 +00:00
|
|
|
kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
|
|
|
|
/* user_defined_timestamps_persisted */ true);
|
2011-06-22 18:45:39 +00:00
|
|
|
files_.push_back(f);
|
|
|
|
}
|
|
|
|
|
2014-07-11 19:52:41 +00:00
|
|
|
int Compare() {
|
|
|
|
int diff = 0;
|
|
|
|
for (size_t i = 0; i < files_.size(); i++) {
|
|
|
|
if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) {
|
|
|
|
diff++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return diff;
|
2011-06-22 18:45:39 +00:00
|
|
|
}
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData, which contains just fd, smallest_slice and largest_slice. Create compressed_levels_ in Version; its space is allocated using an arena.
This increases file metadata locality and speeds up "Get" and "FindFile".
Benchmarking with an in-memory tmpfs shows roughly a 4% improvement under "random read" and a 2% improvement under "read while writing".
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
};
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// Generates the brief without any preceding Add() call in this test and
// expects it to report zero files and no file-number mismatches.
TEST_F(GenerateLevelFilesBriefTest, Empty) {
  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);

  const auto generated_count = file_level_.num_files;
  ASSERT_EQ(0u, generated_count);

  const int mismatches = Compare();
  ASSERT_EQ(0, mismatches);
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// Adds one file spanning "p".."q", generates the brief, and expects exactly
// one entry whose file number matches the added file.
TEST_F(GenerateLevelFilesBriefTest, Single) {
  Add("p", "q");
  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);

  const auto generated_count = file_level_.num_files;
  ASSERT_EQ(1u, generated_count);

  const int mismatches = Compare();
  ASSERT_EQ(0, mismatches);
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// Adds four files, generates the brief, and expects four entries whose file
// numbers line up position by position with the added files.
TEST_F(GenerateLevelFilesBriefTest, Multiple) {
  // {smallest, largest} user keys, added in this order.
  static const char* const kKeyRanges[][2] = {
      {"150", "200"},
      {"200", "250"},
      {"300", "350"},
      {"400", "450"},
  };
  for (const auto& range : kKeyRanges) {
    Add(range[0], range[1]);
  }

  DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);

  const auto generated_count = file_level_.num_files;
  ASSERT_EQ(4u, generated_count);

  const int mismatches = Compare();
  ASSERT_EQ(0, mismatches);
}
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
With a fixed max_bytes_for_level_base, the ratio between the size of the largest level and that of the second one can range from 0 to the multiplier. This frequently makes the LSM tree irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When it is turned on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: Added new unit tests; all test suites pass, including under valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
class CountingLogger : public Logger {
|
|
|
|
public:
|
|
|
|
CountingLogger() : log_count(0) {}
|
|
|
|
using Logger::Logv;
|
2019-02-19 21:36:04 +00:00
|
|
|
void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
int log_count;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Returns a default-constructed Options with only two fields overridden:
// num_levels is set to `num_levels` and info_log is set to `logger`.
Options GetOptionsWithNumLevels(int num_levels,
                                std::shared_ptr<CountingLogger> logger) {
  Options result;
  result.info_log = logger;
  result.num_levels = num_levels;
  return result;
}
|
|
|
|
|
2020-04-10 23:03:33 +00:00
|
|
|
class VersionStorageInfoTestBase : public testing::Test {
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
public:
|
|
|
|
const Comparator* ucmp_;
|
|
|
|
InternalKeyComparator icmp_;
|
|
|
|
std::shared_ptr<CountingLogger> logger_;
|
|
|
|
Options options_;
|
2021-05-05 20:59:21 +00:00
|
|
|
ImmutableOptions ioptions_;
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
MutableCFOptions mutable_cf_options_;
|
|
|
|
VersionStorageInfo vstorage_;
|
|
|
|
|
|
|
|
// Builds an InternalKey of type kTypeValue from the user key `ukey` and the
// sequence number `smallest_seq` (100 when the caller does not supply one).
InternalKey GetInternalKey(const char* ukey,
                           SequenceNumber smallest_seq = 100) {
  InternalKey key(ukey, smallest_seq, kTypeValue);
  return key;
}
|
|
|
|
|
2020-04-10 23:03:33 +00:00
|
|
|
explicit VersionStorageInfoTestBase(const Comparator* ucmp)
|
|
|
|
: ucmp_(ucmp),
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
icmp_(ucmp_),
|
|
|
|
logger_(new CountingLogger()),
|
|
|
|
options_(GetOptionsWithNumLevels(6, logger_)),
|
|
|
|
ioptions_(options_),
|
2016-09-14 04:11:59 +00:00
|
|
|
mutable_cf_options_(options_),
|
2020-04-10 23:03:33 +00:00
|
|
|
vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel,
|
|
|
|
/*src_vstorage=*/nullptr,
|
Delay bottommost level single file compactions (#11701)
Summary:
For leveled compaction, RocksDB has a special kind of compaction with reason "kBottommmostFiles" that compacts bottommost level files to clear data held by snapshots (more detail in https://github.com/facebook/rocksdb/issues/3009). Such compactions can happen soon after a relevant snapshot is released. For some use cases, a bottommost file may contain only a small amount of keys that can be cleared, so compacting such a file has a high write amp. In addition, these bottommost files may be compacted in compactions with reason other than "kBottommmostFiles" if we wait for some time (so that enough data is ingested to trigger such a compaction). This PR introduces an option `bottommost_file_compaction_delay` to specify the delay of these bottommost level single file compactions.
* The main change is in `VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction()` where we only add a file to `bottommost_files_marked_for_compaction_` if it oldest_snapshot is larger than its non-zero largest_seqno **and** the file is old enough. Note that if a file is not old enough but its largest_seqno is less than oldest_snapshot, we exclude it from the calculation of `bottommost_files_mark_threshold_`. This makes the change simpler, but such a file's eligibility for compaction will only be checked the next time `ComputeBottommostFilesMarkedForCompaction()` is called. This happens when a new Version is created (compaction, flush, SetOptions()...), a new enough snapshot is released (`VersionStorageInfo::UpdateOldestSnapshot()`) or when a compaction is picked and compaction score has to be re-calculated.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11701
Test Plan:
* Add two unit tests to test when bottommost_file_compaction_delay > 0.
* Ran crash test with the new option.
Reviewed By: jaykorean, ajkr
Differential Revision: D48331564
Pulled By: cbi42
fbshipit-source-id: c584f3dc5f6354fce3ed65f4c6366dc450b15ba8
2023-08-17 00:45:44 +00:00
|
|
|
/*_force_consistency_checks=*/false,
|
|
|
|
EpochNumberRequirement::kMustPresent, ioptions_.clock,
|
2023-10-27 22:56:48 +00:00
|
|
|
mutable_cf_options_.bottommost_file_compaction_delay,
|
2023-11-06 19:43:59 +00:00
|
|
|
OffpeakTimeOption()) {}
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
|
2020-04-10 23:03:33 +00:00
|
|
|
// Drops one reference from every file in every level of vstorage_ and deletes
// each file whose reference count reaches zero as a result.
~VersionStorageInfoTestBase() override {
  for (int level = 0; level < vstorage_.num_levels(); ++level) {
    for (auto* file : vstorage_.LevelFiles(level)) {
      if (--file->refs != 0) {
        continue;  // Something else still holds a reference to this file.
      }
      delete file;
    }
  }
}
|
|
|
|
|
|
|
|
void Add(int level, uint32_t file_number, const char* smallest,
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
const char* largest, uint64_t file_size = 0,
|
2022-12-29 21:28:24 +00:00
|
|
|
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber,
|
|
|
|
uint64_t compensated_range_deletion_size = 0) {
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
constexpr SequenceNumber dummy_seq = 0;
|
|
|
|
|
|
|
|
Add(level, file_number, GetInternalKey(smallest, dummy_seq),
|
2022-12-29 21:28:24 +00:00
|
|
|
GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number,
|
|
|
|
compensated_range_deletion_size);
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
}
|
2018-07-14 00:34:54 +00:00
|
|
|
|
|
|
|
void Add(int level, uint32_t file_number, const InternalKey& smallest,
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
const InternalKey& largest, uint64_t file_size = 0,
|
2022-12-29 21:28:24 +00:00
|
|
|
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber,
|
|
|
|
uint64_t compensated_range_deletion_size = 0) {
|
2018-07-14 00:34:54 +00:00
|
|
|
assert(level < vstorage_.num_levels());
|
2019-10-14 22:19:31 +00:00
|
|
|
FileMetaData* f = new FileMetaData(
|
|
|
|
file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
|
|
|
|
/* largest_seq */ 0, /* marked_for_compact */ false,
|
2021-12-03 22:42:05 +00:00
|
|
|
Temperature::kUnknown, oldest_blob_file_number,
|
|
|
|
kUnknownOldestAncesterTime, kUnknownFileCreationTime,
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
|
2023-06-22 04:49:01 +00:00
|
|
|
kNullUniqueId64x2, compensated_range_deletion_size, 0,
|
|
|
|
/* user_defined_timestamps_persisted */ true);
|
2018-07-14 00:34:54 +00:00
|
|
|
vstorage_.AddFile(level, f);
|
|
|
|
}
|
|
|
|
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
|
|
|
|
uint64_t total_blob_bytes,
|
|
|
|
BlobFileMetaData::LinkedSsts linked_ssts,
|
|
|
|
uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
|
|
|
|
auto shared_meta = SharedBlobFileMetaData::Create(
|
|
|
|
blob_file_number, total_blob_count, total_blob_bytes,
|
|
|
|
/* checksum_method */ std::string(),
|
|
|
|
/* checksum_value */ std::string());
|
|
|
|
auto meta =
|
|
|
|
BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
|
|
|
|
garbage_blob_count, garbage_blob_bytes);
|
|
|
|
|
|
|
|
vstorage_.AddBlobFile(std::move(meta));
|
|
|
|
}
|
|
|
|
|
2022-02-04 16:18:18 +00:00
|
|
|
// Finishes building `vstorage_` once all files have been added: first
// prepares it for version append using the fixture's immutable and mutable
// column family options, then marks it finalized.
void UpdateVersionStorageInfo() {
  // Order matters: preparation must run before the storage info is finalized.
  vstorage_.PrepareForVersionAppend(ioptions_, mutable_cf_options_);
  vstorage_.SetFinalized();
}
|
|
|
|
|
2018-07-14 00:34:54 +00:00
|
|
|
std::string GetOverlappingFiles(int level, const InternalKey& begin,
|
|
|
|
const InternalKey& end) {
|
|
|
|
std::vector<FileMetaData*> inputs;
|
|
|
|
vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
|
|
|
|
|
|
|
|
std::string result;
|
|
|
|
for (size_t i = 0; i < inputs.size(); ++i) {
|
|
|
|
if (i > 0) {
|
|
|
|
result += ",";
|
|
|
|
}
|
|
|
|
AppendNumberTo(&result, inputs[i]->fd.GetNumber());
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
};
|
|
|
|
|
2020-04-10 23:03:33 +00:00
|
|
|
class VersionStorageInfoTest : public VersionStorageInfoTestBase {
|
|
|
|
public:
|
|
|
|
VersionStorageInfoTest() : VersionStorageInfoTestBase(BytewiseComparator()) {}
|
|
|
|
|
2023-12-04 19:17:32 +00:00
|
|
|
~VersionStorageInfoTest() override = default;
|
2020-04-10 23:03:33 +00:00
|
|
|
};
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// With dynamic level sizing disabled, a base of 10 bytes and a multiplier of
// 5, the per-level limits for levels 1..4 are expected to be 10, 50, 250 and
// 1250 bytes, and nothing should be logged.
TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
  mutable_cf_options_.max_bytes_for_level_base = 10;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
  ioptions_.level_compaction_dynamic_level_bytes = false;

  // One file each at levels 4 and 5.
  Add(4, 100U, "1", "2", 100U);
  Add(5, 101U, "1", "2", 100U);

  UpdateVersionStorageInfo();

  struct LevelLimit {
    int level;
    uint64_t max_bytes;
  };
  const LevelLimit kExpectedLimits[] = {
      {1, 10U},
      {2, 50U},
      {3, 250U},
      {4, 1250U},
  };
  for (const LevelLimit& expected : kExpectedLimits) {
    ASSERT_EQ(vstorage_.MaxBytesForLevel(expected.level), expected.max_bytes);
  }

  ASSERT_EQ(0, logger_->log_count);
}
|
|
|
|
|
2022-02-04 16:18:18 +00:00
|
|
|
// Dynamic level sizing (base 1000, multiplier 5) with a single file added at
// level 5: the base level is expected to be 5 and nothing should be logged.
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_1) {
  mutable_cf_options_.max_bytes_for_level_base = 1000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
  ioptions_.level_compaction_dynamic_level_bytes = true;

  // NOTE(review): the last argument (500U) is presumably the file size, as in
  // the InternalKey overload of Add -- confirm against the string overload.
  Add(5, 1U, "1", "2", 500U);

  UpdateVersionStorageInfo();

  ASSERT_EQ(0, logger_->log_count);
  ASSERT_EQ(vstorage_.base_level(), 5);
}
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
|
2022-02-04 16:18:18 +00:00
|
|
|
// Dynamic level sizing (base 1000, multiplier 5) with two files added at
// level 5 (500U and 550U): the base level is expected to be 4 with a
// 1000-byte limit, and nothing should be logged.
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_2) {
  mutable_cf_options_.max_bytes_for_level_base = 1000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
  ioptions_.level_compaction_dynamic_level_bytes = true;

  // Two files at level 5 with non-overlapping key ranges.
  Add(5, 1U, "1", "2", 500U);
  Add(5, 2U, "3", "4", 550U);

  UpdateVersionStorageInfo();

  ASSERT_EQ(0, logger_->log_count);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
  ASSERT_EQ(vstorage_.base_level(), 4);
}
|
|
|
|
|
|
|
|
// Dynamic level sizing (base 1000, multiplier 5) with two files at level 5
// and one at level 4: the base level is expected to be 4 with a 1000-byte
// limit, and nothing should be logged.
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_3) {
  mutable_cf_options_.max_bytes_for_level_base = 1000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
  ioptions_.level_compaction_dynamic_level_bytes = true;

  // Level 5: two files (500U and 550U).
  Add(5, 1U, "1", "2", 500U);
  Add(5, 2U, "3", "4", 550U);
  // Level 4: one file (550U).
  Add(4, 3U, "3", "4", 550U);

  UpdateVersionStorageInfo();

  ASSERT_EQ(0, logger_->log_count);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
  ASSERT_EQ(vstorage_.base_level(), 4);
}
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
2015-02-05 19:44:17 +00:00
|
|
|
|
2022-02-04 16:18:18 +00:00
|
|
|
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_4) {
  // Scenario: dynamic level bytes enabled, level base 1000, multiplier 5, and
  // files only on levels 3, 4 and 5 (550, 550 and 1050 bytes in total).
  ioptions_.level_compaction_dynamic_level_bytes = true;
  mutable_cf_options_.max_bytes_for_level_base = 1000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;

  // One row per Add() call, issued in table order.
  // NOTE(review): Add() is defined outside this chunk; the two strings are
  // presumably the file's key bounds and the last value its size -- confirm.
  struct InputFile {
    int level;
    uint32_t number;
    const char* first_key;
    const char* last_key;
    uint64_t bytes;
  };
  const InputFile kInputFiles[] = {
      {5, 1U, "1", "2", 500U},
      {5, 2U, "3", "4", 550U},
      {4, 3U, "3", "4", 550U},
      {3, 4U, "3", "4", 250U},
      {3, 5U, "5", "7", 300U},
  };
  for (const InputFile& file : kInputFiles) {
    Add(file.level, file.number, file.first_key, file.last_key, file.bytes);
  }

  UpdateVersionStorageInfo();

  // The fixture logger is expected to have counted exactly one entry.
  ASSERT_EQ(1, logger_->log_count);
  // Level 3 is the base level and gets the full configured base; level 4 is
  // pinned at 1005.
  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U);
  ASSERT_EQ(vstorage_.base_level(), 3);
}
|
|
|
|
|
|
|
|
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_5) {
  // Scenario: same options and L3-L5 contents as MaxBytesForLevelDynamic_4,
  // plus two 5-byte files on level 1.
  ioptions_.level_compaction_dynamic_level_bytes = true;
  mutable_cf_options_.max_bytes_for_level_base = 1000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;

  // One row per Add() call, issued in table order.
  // NOTE(review): Add() is defined outside this chunk; the two strings are
  // presumably the file's key bounds and the last value its size -- confirm.
  struct InputFile {
    int level;
    uint32_t number;
    const char* first_key;
    const char* last_key;
    uint64_t bytes;
  };
  const InputFile kInputFiles[] = {
      {5, 1U, "1", "2", 500U},
      {5, 2U, "3", "4", 550U},
      {4, 3U, "3", "4", 550U},
      {3, 4U, "3", "4", 250U},
      {3, 5U, "5", "7", 300U},
      {1, 6U, "3", "4", 5U},
      {1, 7U, "8", "9", 5U},
  };
  for (const InputFile& file : kInputFiles) {
    Add(file.level, file.number, file.first_key, file.last_key, file.bytes);
  }

  UpdateVersionStorageInfo();

  // The fixture logger is expected to have counted exactly one entry.
  ASSERT_EQ(1, logger_->log_count);
  // With level 1 populated it becomes the base level: L1 gets the configured
  // base, L2 is pinned at 1005, and L3/L4 must be strictly larger than that.
  ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U);
  ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U);
  ASSERT_EQ(vstorage_.base_level(), 1);
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) {
  // Scenario: dynamic level bytes with a small base (100) and multiplier (2),
  // and one file on every level from 0 through 5.
  ioptions_.level_compaction_dynamic_level_bytes = true;
  mutable_cf_options_.max_bytes_for_level_base = 100;
  mutable_cf_options_.max_bytes_for_level_multiplier = 2;

  // One row per Add() call, issued in table order; every file uses the same
  // pair of key strings.
  // NOTE(review): Add() is defined outside this chunk; the last value is
  // presumably the file size -- confirm.
  struct InputFile {
    int level;
    uint32_t number;
    uint64_t bytes;
  };
  const InputFile kInputFiles[] = {
      {0, 1U, 50U},   {1, 2U, 50U},   {2, 3U, 500U},
      {3, 4U, 500U},  {4, 5U, 1700U}, {5, 6U, 500U},
  };
  for (const InputFile& file : kInputFiles) {
    Add(file.level, file.number, "1", "2", file.bytes);
  }

  UpdateVersionStorageInfo();

  // Targets are expected to double per level starting from the configured
  // base at level 1.
  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U);
  ASSERT_EQ(vstorage_.base_level(), 1);
  // Nothing should have been counted by the fixture logger.
  ASSERT_EQ(0, logger_->log_count);
}
|
|
|
|
|
|
|
|
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
  // Decimal gigabyte (10^9); all sizes below are multiples of it so the
  // arithmetic is carried out in 64 bits.
  const uint64_t kGigabyte = 1000U * 1000U * 1000U;

  // Scenario: dynamic level bytes with a 10 GB base and multiplier 10, a tiny
  // L0 file, and very large files on levels 3, 4 and 5.
  ioptions_.level_compaction_dynamic_level_bytes = true;
  mutable_cf_options_.max_bytes_for_level_base = 10U * kGigabyte;
  mutable_cf_options_.max_bytes_for_level_multiplier = 10;

  // One row per Add() call, issued in table order; every file uses the same
  // pair of key strings.
  // NOTE(review): Add() is defined outside this chunk; the last value is
  // presumably the file size -- confirm.
  struct InputFile {
    int level;
    uint32_t number;
    uint64_t bytes;
  };
  const InputFile kInputFiles[] = {
      {0, 1U, 50U},
      {3, 4U, 32U * kGigabyte},
      {4, 5U, 500U * kGigabyte},
      {5, 6U, 3000U * kGigabyte},
  };
  for (const InputFile& file : kInputFiles) {
    Add(file.level, file.number, "1", "2", file.bytes);
  }

  UpdateVersionStorageInfo();

  // Expected targets shrink by a factor of 10 per level going up from L5's
  // 3000 GB, until level 2, which gets the configured 10 GB base and is the
  // base level.
  ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kGigabyte);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kGigabyte);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kGigabyte);
  ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kGigabyte);
  ASSERT_EQ(vstorage_.base_level(), 2);
  // Nothing should have been counted by the fixture logger.
  ASSERT_EQ(0, logger_->log_count);
}
|
|
|
|
|
2018-10-22 17:18:51 +00:00
|
|
|
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
  // Scenario: dynamic level bytes with base 40000 and multiplier 5, an L0
  // compaction trigger of 2 files, three 10000-byte L0 files, and one file on
  // each of levels 2 through 5.
  ioptions_.level_compaction_dynamic_level_bytes = true;
  mutable_cf_options_.max_bytes_for_level_base = 40000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
  mutable_cf_options_.level0_file_num_compaction_trigger = 2;

  // One row per Add() call, issued in table order; every file uses the same
  // pair of key strings.
  // NOTE(review): Add() is defined outside this chunk; the last value is
  // presumably the file size -- confirm.
  struct InputFile {
    int level;
    uint32_t number;
    uint64_t bytes;
  };
  const InputFile kInputFiles[] = {
      // Level 0.
      {0, 1U, 10000U},
      {0, 2U, 10000U},
      {0, 3U, 10000U},
      // Non-zero levels, bottom level first.
      {5, 4U, 1286250U},
      {4, 5U, 200000U},
      {3, 6U, 40000U},
      {2, 7U, 8000U},
  };
  for (const InputFile& file : kInputFiles) {
    Add(file.level, file.number, "1", "2", file.bytes);
  }

  UpdateVersionStorageInfo();

  ASSERT_EQ(0, logger_->log_count);
  ASSERT_EQ(2, vstorage_.base_level());
  // The level multiplier is expected to equal the configured multiplier (5).
  ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
  ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
  ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
  ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));

  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
  // Only L0 hits compaction, so it must be the top-ranked entry.
  ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0);
}
|
|
|
|
|
|
|
|
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
  // Scenario: dynamic level bytes with base 10000 and multiplier 5, an L0
  // compaction trigger of 4 files, three 10000-byte L0 files, and levels 2-4
  // each slightly (or, for L2, substantially) over their expected targets.
  ioptions_.level_compaction_dynamic_level_bytes = true;
  mutable_cf_options_.max_bytes_for_level_base = 10000;
  mutable_cf_options_.max_bytes_for_level_multiplier = 5;
  mutable_cf_options_.level0_file_num_compaction_trigger = 4;

  // One row per Add() call, issued in table order; every file uses the same
  // pair of key strings.
  // NOTE(review): Add() is defined outside this chunk; the last value is
  // presumably the file size -- confirm.
  struct InputFile {
    int level;
    uint32_t number;
    uint64_t bytes;
  };
  const InputFile kInputFiles[] = {
      // Level 0.
      {0, 11U, 10000U},
      {0, 12U, 10000U},
      {0, 13U, 10000U},
      // Level size should be around 10,000, 10,290, 51,450, 257,250
      // (levels 1 through 4).
      {5, 4U, 1286250U},
      {4, 5U, 258000U},  // unadjusted score 1.003
      {3, 6U, 53000U},   // unadjusted score 1.03
      {2, 7U, 20000U},   // unadjusted score 1.94
  };
  for (const InputFile& file : kInputFiles) {
    Add(file.level, file.number, "1", "2", file.bytes);
  }

  UpdateVersionStorageInfo();

  ASSERT_EQ(0, logger_->log_count);
  ASSERT_EQ(1, vstorage_.base_level());
  ASSERT_EQ(10000U, vstorage_.MaxBytesForLevel(1));
  ASSERT_EQ(10290U, vstorage_.MaxBytesForLevel(2));
  ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
  ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));

  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
  // Although L2 and L3 have higher unadjusted compaction scores, a relatively
  // large L0 is about to be compacted down, so L4 is the level picked for
  // compaction after L0. L0 itself is still ranked first because it is
  // oversized.
  ASSERT_EQ(0, vstorage_.CompactionScoreLevel(0));
  ASSERT_EQ(4, vstorage_.CompactionScoreLevel(1));
}
|
|
|
|
|
|
|
|
TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
|
|
|
|
ioptions_.level_compaction_dynamic_level_bytes = true;
|
Change The Way Level Target And Compaction Score Are Calculated (#10057)
Summary:
The current level targets for dynamical leveling has a problem: the target level size will dramatically change after a L0->L1 compaction. When there are many L0 bytes, lower level compactions are delayed, but they will be resumed after the L0->L1 compaction finishes, so the expected write amplification benefits might not be realized. The proposal here is to revert the level targetting size, but instead relying on adjusting score for each level to prioritize levels that need to compact most.
Basic idea:
(1) target level size isn't adjusted, but score is adjusted. The reasoning is that with parallel compactions, holding compactions from happening might not be desirable, but we would like the compactions are scheduled from the level we feel most needed. For example, if we have a extra-large L2, we would like all compactions are scheduled for L2->L3 compactions, rather than L4->L5. This gets complicated when a large L0->L1 compaction is going on. Should we compact L2->L3 or L4->L5. So the proposal for that is:
(2) the score is calculated by actual level size / (target size + estimated upper bytes coming down). The reasoning is that if we have a large amount of pending L0/L1 bytes coming down, compacting L2->L3 might be more expensive, as when the L0 bytes are compacted down to L2, the actual L2->L3 fanout would change dramatically. On the other hand, when the amount of bytes coming down to L5, the impacts to L5->L6 fanout are much less. So when calculating target score, we can adjust it by adding estimated downward bytes to the target level size.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10057
Test Plan: Repurpose tests VersionStorageInfoTest.MaxBytesForLevelDynamicWithLargeL0_* tests to cover this scenario.
Reviewed By: ajkr
Differential Revision: D37539742
fbshipit-source-id: 9c154cbfe92023f918cf5d80875d8776ad4831a4
2022-06-30 20:32:47 +00:00
|
|
|
mutable_cf_options_.max_bytes_for_level_base = 20000;
|
2018-10-22 17:18:51 +00:00
|
|
|
mutable_cf_options_.max_bytes_for_level_multiplier = 5;
|
Change The Way Level Target And Compaction Score Are Calculated (#10057)
Summary:
The current level targets for dynamical leveling has a problem: the target level size will dramatically change after a L0->L1 compaction. When there are many L0 bytes, lower level compactions are delayed, but they will be resumed after the L0->L1 compaction finishes, so the expected write amplification benefits might not be realized. The proposal here is to revert the level targetting size, but instead relying on adjusting score for each level to prioritize levels that need to compact most.
Basic idea:
(1) target level size isn't adjusted, but score is adjusted. The reasoning is that with parallel compactions, holding compactions from happening might not be desirable, but we would like the compactions are scheduled from the level we feel most needed. For example, if we have a extra-large L2, we would like all compactions are scheduled for L2->L3 compactions, rather than L4->L5. This gets complicated when a large L0->L1 compaction is going on. Should we compact L2->L3 or L4->L5. So the proposal for that is:
(2) the score is calculated by actual level size / (target size + estimated upper bytes coming down). The reasoning is that if we have a large amount of pending L0/L1 bytes coming down, compacting L2->L3 might be more expensive, as when the L0 bytes are compacted down to L2, the actual L2->L3 fanout would change dramatically. On the other hand, when the amount of bytes coming down to L5, the impacts to L5->L6 fanout are much less. So when calculating target score, we can adjust it by adding estimated downward bytes to the target level size.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10057
Test Plan: Repurpose tests VersionStorageInfoTest.MaxBytesForLevelDynamicWithLargeL0_* tests to cover this scenario.
Reviewed By: ajkr
Differential Revision: D37539742
fbshipit-source-id: 9c154cbfe92023f918cf5d80875d8776ad4831a4
2022-06-30 20:32:47 +00:00
|
|
|
mutable_cf_options_.level0_file_num_compaction_trigger = 5;
|
2018-10-22 17:18:51 +00:00
|
|
|
|
Change The Way Level Target And Compaction Score Are Calculated (#10057)
Summary:
The current level targets for dynamical leveling has a problem: the target level size will dramatically change after a L0->L1 compaction. When there are many L0 bytes, lower level compactions are delayed, but they will be resumed after the L0->L1 compaction finishes, so the expected write amplification benefits might not be realized. The proposal here is to revert the level targetting size, but instead relying on adjusting score for each level to prioritize levels that need to compact most.
Basic idea:
(1) target level size isn't adjusted, but score is adjusted. The reasoning is that with parallel compactions, holding compactions from happening might not be desirable, but we would like the compactions are scheduled from the level we feel most needed. For example, if we have a extra-large L2, we would like all compactions are scheduled for L2->L3 compactions, rather than L4->L5. This gets complicated when a large L0->L1 compaction is going on. Should we compact L2->L3 or L4->L5. So the proposal for that is:
(2) the score is calculated by actual level size / (target size + estimated upper bytes coming down). The reasoning is that if we have a large amount of pending L0/L1 bytes coming down, compacting L2->L3 might be more expensive, as when the L0 bytes are compacted down to L2, the actual L2->L3 fanout would change dramatically. On the other hand, when the amount of bytes coming down to L5, the impacts to L5->L6 fanout are much less. So when calculating target score, we can adjust it by adding estimated downward bytes to the target level size.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10057
Test Plan: Repurpose tests VersionStorageInfoTest.MaxBytesForLevelDynamicWithLargeL0_* tests to cover this scenario.
Reviewed By: ajkr
Differential Revision: D37539742
fbshipit-source-id: 9c154cbfe92023f918cf5d80875d8776ad4831a4
2022-06-30 20:32:47 +00:00
|
|
|
Add(0, 11U, "1", "2", 2500U);
|
|
|
|
Add(0, 12U, "1", "2", 2500U);
|
|
|
|
Add(0, 13U, "1", "2", 2500U);
|
|
|
|
Add(0, 14U, "1", "2", 2500U);
|
2018-10-22 17:18:51 +00:00
|
|
|
|
Change The Way Level Target And Compaction Score Are Calculated (#10057)
Summary:
The current level targets for dynamical leveling has a problem: the target level size will dramatically change after a L0->L1 compaction. When there are many L0 bytes, lower level compactions are delayed, but they will be resumed after the L0->L1 compaction finishes, so the expected write amplification benefits might not be realized. The proposal here is to revert the level targetting size, but instead relying on adjusting score for each level to prioritize levels that need to compact most.
Basic idea:
(1) target level size isn't adjusted, but score is adjusted. The reasoning is that with parallel compactions, holding compactions from happening might not be desirable, but we would like the compactions are scheduled from the level we feel most needed. For example, if we have a extra-large L2, we would like all compactions are scheduled for L2->L3 compactions, rather than L4->L5. This gets complicated when a large L0->L1 compaction is going on. Should we compact L2->L3 or L4->L5. So the proposal for that is:
(2) the score is calculated by actual level size / (target size + estimated upper bytes coming down). The reasoning is that if we have a large amount of pending L0/L1 bytes coming down, compacting L2->L3 might be more expensive, as when the L0 bytes are compacted down to L2, the actual L2->L3 fanout would change dramatically. On the other hand, when the amount of bytes coming down to L5, the impacts to L5->L6 fanout are much less. So when calculating target score, we can adjust it by adding estimated downward bytes to the target level size.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10057
Test Plan: Repurpose tests VersionStorageInfoTest.MaxBytesForLevelDynamicWithLargeL0_* tests to cover this scenario.
Reviewed By: ajkr
Differential Revision: D37539742
fbshipit-source-id: 9c154cbfe92023f918cf5d80875d8776ad4831a4
2022-06-30 20:32:47 +00:00
|
|
|
// Level size should be around 20,000, 53000, 258000
|
2018-10-22 17:18:51 +00:00
|
|
|
Add(5, 4U, "1", "2", 1286250U);
|
Change The Way Level Target And Compaction Score Are Calculated (#10057)
Summary:
The current level targets for dynamical leveling has a problem: the target level size will dramatically change after a L0->L1 compaction. When there are many L0 bytes, lower level compactions are delayed, but they will be resumed after the L0->L1 compaction finishes, so the expected write amplification benefits might not be realized. The proposal here is to revert the level targetting size, but instead relying on adjusting score for each level to prioritize levels that need to compact most.
Basic idea:
(1) target level size isn't adjusted, but score is adjusted. The reasoning is that with parallel compactions, holding compactions from happening might not be desirable, but we would like the compactions are scheduled from the level we feel most needed. For example, if we have a extra-large L2, we would like all compactions are scheduled for L2->L3 compactions, rather than L4->L5. This gets complicated when a large L0->L1 compaction is going on. Should we compact L2->L3 or L4->L5. So the proposal for that is:
(2) the score is calculated by actual level size / (target size + estimated upper bytes coming down). The reasoning is that if we have a large amount of pending L0/L1 bytes coming down, compacting L2->L3 might be more expensive, as when the L0 bytes are compacted down to L2, the actual L2->L3 fanout would change dramatically. On the other hand, when the amount of bytes coming down to L5, the impacts to L5->L6 fanout are much less. So when calculating target score, we can adjust it by adding estimated downward bytes to the target level size.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10057
Test Plan: Repurpose tests VersionStorageInfoTest.MaxBytesForLevelDynamicWithLargeL0_* tests to cover this scenario.
Reviewed By: ajkr
Differential Revision: D37539742
fbshipit-source-id: 9c154cbfe92023f918cf5d80875d8776ad4831a4
2022-06-30 20:32:47 +00:00
|
|
|
Add(4, 5U, "1", "2", 260000U); // Unadjusted score 1.01, adjusted about 4.3
|
|
|
|
Add(3, 6U, "1", "2", 85000U); // Unadjusted score 1.42, adjusted about 11.6
|
|
|
|
Add(2, 7U, "1", "2", 30000); // Unadjusted score 1.5, adjusted about 10.0
|
2018-10-22 17:18:51 +00:00
|
|
|
|
2022-02-04 16:18:18 +00:00
|
|
|
UpdateVersionStorageInfo();
|
|
|
|
|
2018-10-22 17:18:51 +00:00
|
|
|
ASSERT_EQ(0, logger_->log_count);
|
|
|
|
ASSERT_EQ(2, vstorage_.base_level());
|
Change The Way Level Target And Compaction Score Are Calculated (#10057)
Summary:
The current level targets for dynamical leveling has a problem: the target level size will dramatically change after a L0->L1 compaction. When there are many L0 bytes, lower level compactions are delayed, but they will be resumed after the L0->L1 compaction finishes, so the expected write amplification benefits might not be realized. The proposal here is to revert the level targetting size, but instead relying on adjusting score for each level to prioritize levels that need to compact most.
Basic idea:
(1) target level size isn't adjusted, but score is adjusted. The reasoning is that with parallel compactions, holding compactions from happening might not be desirable, but we would like the compactions are scheduled from the level we feel most needed. For example, if we have a extra-large L2, we would like all compactions are scheduled for L2->L3 compactions, rather than L4->L5. This gets complicated when a large L0->L1 compaction is going on. Should we compact L2->L3 or L4->L5. So the proposal for that is:
(2) the score is calculated by actual level size / (target size + estimated upper bytes coming down). The reasoning is that if we have a large amount of pending L0/L1 bytes coming down, compacting L2->L3 might be more expensive, as when the L0 bytes are compacted down to L2, the actual L2->L3 fanout would change dramatically. On the other hand, when the amount of bytes coming down to L5, the impacts to L5->L6 fanout are much less. So when calculating target score, we can adjust it by adding estimated downward bytes to the target level size.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10057
Test Plan: Repurpose tests VersionStorageInfoTest.MaxBytesForLevelDynamicWithLargeL0_* tests to cover this scenario.
Reviewed By: ajkr
Differential Revision: D37539742
fbshipit-source-id: 9c154cbfe92023f918cf5d80875d8776ad4831a4
2022-06-30 20:32:47 +00:00
|
|
|
ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2));
|
|
|
|
|
|
|
|
vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
|
|
|
|
// Although L2 has higher unadjusted compaction score, considering
|
|
|
|
// a relatively large L0 being compacted down soon, L3 is picked up for
|
|
|
|
// compaction.
|
|
|
|
|
|
|
|
ASSERT_EQ(3, vstorage_.CompactionScoreLevel(0));
|
|
|
|
ASSERT_EQ(2, vstorage_.CompactionScoreLevel(1));
|
|
|
|
ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2));
|
2018-10-22 17:18:51 +00:00
|
|
|
}
|
|
|
|
|
2023-04-06 18:20:43 +00:00
|
|
|
// Builds an LSM shape containing levels that are not needed for the amount of
// data stored and checks that those levels get a high compaction score.
TEST_F(VersionStorageInfoTest, DrainUnnecessaryLevel) {
  ioptions_.level_compaction_dynamic_level_bytes = true;
  mutable_cf_options_.max_bytes_for_level_multiplier = 10;
  mutable_cf_options_.max_bytes_for_level_base = 1000;

  // Create a few unnecessary levels and see if the score is calculated
  // correctly.
  // L5: target size 1010000
  Add(5, 1U, "1", "2", 2000U);
  // L4: target size 101000
  Add(4, 2U, "1", "2", 200U);
  // Unnecessary levels start here.
  // L3: target size 10100
  Add(3, 3U, "1", "2", 100U);
  // L2 (left empty): target size 1010
  // L1: target size 1000 = max(base_bytes_min + 1, base_bytes_max)
  Add(1, 4U, "1", "2", 10U);

  UpdateVersionStorageInfo();

  ASSERT_EQ(1, vstorage_.base_level());
  ASSERT_EQ(1000, vstorage_.MaxBytesForLevel(1));
  ASSERT_EQ(10100, vstorage_.MaxBytesForLevel(3));

  vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);

  // Levels 1 and 3 must be eligible for compaction. Both are much smaller
  // than their target size, so size is not what drives the score above 10.
  ASSERT_EQ(1, vstorage_.CompactionScoreLevel(0));
  ASSERT_GT(vstorage_.CompactionScore(0), 10);

  ASSERT_EQ(3, vstorage_.CompactionScoreLevel(1));
  ASSERT_GT(vstorage_.CompactionScore(1), 10);
}
|
|
|
|
|
2015-07-22 04:33:20 +00:00
|
|
|
// Every file above the last level overlaps the single last-level file, so the
// estimate is expected to equal the size of that last-level file alone.
TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
  constexpr uint64_t upper_file_size = 1U;
  constexpr uint64_t last_level_file_size = 10U;

  // Test whether the overlaps are detected as expected.
  // Perfect overlap with last level
  Add(1, 1U, "4", "7", upper_file_size);
  // Partial overlap with last level
  Add(2, 2U, "3", "5", upper_file_size);
  // Partial overlap with last level
  Add(2, 3U, "6", "8", upper_file_size);
  // Contains range of last level
  Add(3, 4U, "1", "9", upper_file_size);
  // Inside range of last level
  Add(4, 5U, "4", "5", upper_file_size);
  // Inside range of last level
  Add(4, 6U, "6", "7", upper_file_size);
  // The last-level file itself
  Add(5, 7U, "4", "7", last_level_file_size);

  UpdateVersionStorageInfo();

  ASSERT_EQ(last_level_file_size, vstorage_.EstimateLiveDataSize());
}
|
|
|
|
|
|
|
|
// Seven files of size 1 spread over L0-L3; three of them are expected to be
// ignored because a lower level covers their range, leaving an estimate of 4.
TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
  constexpr uint64_t file_size = 1U;

  // Level 0 is not ordered
  Add(0, 1U, "9", "9", file_size);
  // Ignored because of [5,6] in l1
  Add(0, 2U, "5", "6", file_size);
  // Ignored because of [2,3] in l2
  Add(1, 3U, "1", "2", file_size);
  // Ignored because of [2,3] in l2
  Add(1, 4U, "3", "4", file_size);
  Add(1, 5U, "5", "6", file_size);
  Add(2, 6U, "2", "3", file_size);
  Add(3, 7U, "7", "8", file_size);

  UpdateVersionStorageInfo();

  ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
}
|
|
|
|
|
2023-12-13 18:37:27 +00:00
|
|
|
// With only L0 populated, the oldest L0 file is the one reported as
// bottommost. (This could be improved in case the L0 files cover disjoint
// key-ranges.)
TEST_F(VersionStorageInfoTest, SingleLevelBottommostData) {
  constexpr int level = 0;
  constexpr uint64_t file_size = 1U;

  Add(level, 1U /* file_number */, "A" /* smallest */, "Z" /* largest */,
      file_size);
  Add(level, 2U /* file_number */, "A" /* smallest */, "Z" /* largest */,
      file_size);
  Add(level, 3U /* file_number */, "0" /* smallest */, "9" /* largest */,
      file_size);

  UpdateVersionStorageInfo();

  // Exactly one bottommost entry is expected: file #3 on level 0.
  const auto& bottommost_files = vstorage_.BottommostFiles();
  ASSERT_EQ(1, bottommost_files.size());

  const auto& only_entry = bottommost_files[0];
  ASSERT_EQ(0, only_entry.first);
  ASSERT_EQ(3U, only_entry.second->fd.GetNumber());
}
|
|
|
|
|
|
|
|
// With several levels populated, the oldest file for a key-range from each
// L1+ level is bottommost. (This could be improved in case an L0 file contains
// the oldest data for some range of keys.)
TEST_F(VersionStorageInfoTest, MultiLevelBottommostData) {
  constexpr uint64_t file_size = 1U;

  // Two L0 files.
  Add(0 /* level */, 1U /* file_number */, "A" /* smallest */,
      "Z" /* largest */, file_size);
  Add(0 /* level */, 2U /* file_number */, "0" /* smallest */,
      "9" /* largest */, file_size);
  // One L1 file.
  Add(1 /* level */, 3U /* file_number */, "A" /* smallest */,
      "D" /* largest */, file_size);
  // Two L2 files.
  Add(2 /* level */, 4U /* file_number */, "E" /* smallest */,
      "H" /* largest */, file_size);
  Add(2 /* level */, 5U /* file_number */, "I" /* smallest */,
      "L" /* largest */, file_size);

  UpdateVersionStorageInfo();

  // Work on a copy ordered by file number so the checks below do not depend
  // on the order in which the bottommost files are reported.
  using LevelAndFile = std::pair<int, FileMetaData*>;
  const auto by_file_number = [](const LevelAndFile& lhs,
                                 const LevelAndFile& rhs) {
    assert(lhs.second);
    assert(rhs.second);
    return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
  };

  autovector<LevelAndFile> bottommost_files = vstorage_.BottommostFiles();
  std::sort(bottommost_files.begin(), bottommost_files.end(), by_file_number);

  // Files #3, #4 and #5 (the L1+ files) are expected; the L0 files are not.
  ASSERT_EQ(3, bottommost_files.size());
  ASSERT_EQ(3U, bottommost_files[0].second->fd.GetNumber());
  ASSERT_EQ(4U, bottommost_files[1].second->fd.GetNumber());
  ASSERT_EQ(5U, bottommost_files[2].second->fd.GetNumber());
}
|
|
|
|
|
2018-07-14 00:34:54 +00:00
|
|
|
// Places three pairs of files on one level and checks which file numbers
// GetOverlappingFiles() reports for various internal-key ranges.
TEST_F(VersionStorageInfoTest, GetOverlappingInputs) {
  constexpr int level = 1;

  // Files #1 and #2: overlap at the range deletion tombstone sentinel.
  Add(level, 1U, {"a", 0, kTypeValue},
      {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1);
  Add(level, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1);

  // Files #3 and #4: overlap at the same user key.
  Add(level, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue},
      1);
  Add(level, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1);

  // Files #5 and #6: do not overlap.
  Add(level, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1);
  Add(level, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1);

  UpdateVersionStorageInfo();

  // Queries around the tombstone-sentinel boundary between #1 and #2.
  ASSERT_EQ("1,2", GetOverlappingFiles(level, {"a", 0, kTypeValue},
                                       {"b", 0, kTypeValue}));
  ASSERT_EQ("1", GetOverlappingFiles(
                     level, {"a", 0, kTypeValue},
                     {"b", kMaxSequenceNumber, kTypeRangeDeletion}));
  ASSERT_EQ("2",
            GetOverlappingFiles(level, {"b", kMaxSequenceNumber, kTypeValue},
                                {"c", 0, kTypeValue}));

  // Queries around the shared user key "e" between #3 and #4.
  ASSERT_EQ("3,4", GetOverlappingFiles(level, {"d", 0, kTypeValue},
                                       {"e", 0, kTypeValue}));
  ASSERT_EQ("3", GetOverlappingFiles(
                     level, {"d", 0, kTypeValue},
                     {"e", kMaxSequenceNumber, kTypeRangeDeletion}));
  ASSERT_EQ("3,4",
            GetOverlappingFiles(level, {"e", kMaxSequenceNumber, kTypeValue},
                                {"f", 0, kTypeValue}));
  ASSERT_EQ("3,4", GetOverlappingFiles(level, {"e", 0, kTypeValue},
                                       {"f", 0, kTypeValue}));

  // The non-overlapping files are each reported on their own.
  ASSERT_EQ("5", GetOverlappingFiles(level, {"g", 0, kTypeValue},
                                     {"h", 0, kTypeValue}));
  ASSERT_EQ("6", GetOverlappingFiles(level, {"i", 0, kTypeValue},
                                     {"j", 0, kTypeValue}));
}
|
|
|
|
|
2020-06-08 22:59:25 +00:00
|
|
|
// Looks files up by number and checks both the reported (level, position)
// location and that metadata is returned; an unknown number must yield an
// invalid location and a null metadata pointer.
TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) {
  constexpr unsigned int first_l0_file = 11U;
  constexpr unsigned int second_l0_file = 12U;
  constexpr unsigned int l2_file = 7U;
  constexpr unsigned int unknown_file = 999U;

  Add(0, first_l0_file, "1", "2", 5000U);
  Add(0, second_l0_file, "1", "2", 5000U);

  Add(2, l2_file, "1", "2", 8000U);

  UpdateVersionStorageInfo();

  // First L0 file: level 0, position 0.
  ASSERT_EQ(vstorage_.GetFileLocation(first_l0_file),
            VersionStorageInfo::FileLocation(0, 0));
  ASSERT_NE(vstorage_.GetFileMetaDataByNumber(first_l0_file), nullptr);

  // Second L0 file: level 0, position 1.
  ASSERT_EQ(vstorage_.GetFileLocation(second_l0_file),
            VersionStorageInfo::FileLocation(0, 1));
  ASSERT_NE(vstorage_.GetFileMetaDataByNumber(second_l0_file), nullptr);

  // The only L2 file: level 2, position 0.
  ASSERT_EQ(vstorage_.GetFileLocation(l2_file),
            VersionStorageInfo::FileLocation(2, 0));
  ASSERT_NE(vstorage_.GetFileMetaDataByNumber(l2_file), nullptr);

  // A file number that was never added.
  ASSERT_FALSE(vstorage_.GetFileLocation(unknown_file).IsValid());
  ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(unknown_file), nullptr);
}
|
|
|
|
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
// With no SST or blob files in VersionStorageInfo, computing the forced blob
// GC candidates must leave the list of marked files empty.
TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) {
  UpdateVersionStorageInfo();

  constexpr double blob_age_cutoff = 0.5;
  constexpr double garbage_force_threshold = 0.75;

  vstorage_.ComputeFilesMarkedForForcedBlobGC(
      blob_age_cutoff, garbage_force_threshold,
      /*enable_blob_garbage_collection=*/true);

  const auto& marked_files = vstorage_.FilesMarkedForForcedBlobGC();
  ASSERT_TRUE(marked_files.empty());
}
|
|
|
|
|
2022-02-11 16:24:44 +00:00
|
|
|
// Exercises ComputeFilesMarkedForForcedBlobGC() when every blob file belongs
// to the oldest batch, under three (age_cutoff, force_threshold) settings.
TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) {
  // Test the edge case when all blob files are part of the oldest batch.
  // We have one L0 SST file #1, and four blob files #10, #11, #12, and #13.
  // The oldest blob file used by SST #1 is blob file #10.

  constexpr int level = 0;

  constexpr uint64_t sst = 1;

  constexpr uint64_t first_blob = 10;
  constexpr uint64_t second_blob = 11;
  constexpr uint64_t third_blob = 12;
  constexpr uint64_t fourth_blob = 13;

  // The single SST file, referencing the first blob file as its oldest one.
  {
    constexpr char smallest[] = "bar1";
    constexpr char largest[] = "foo1";
    constexpr uint64_t file_size = 1000;

    Add(level, sst, smallest, largest, file_size, first_blob);
  }

  // Blob file #10: the only one with a linked SST; 15000 of 100000 bytes are
  // garbage.
  {
    constexpr uint64_t total_blob_count = 10;
    constexpr uint64_t total_blob_bytes = 100000;
    constexpr uint64_t garbage_blob_count = 2;
    constexpr uint64_t garbage_blob_bytes = 15000;

    AddBlob(first_blob, total_blob_count, total_blob_bytes,
            BlobFileMetaData::LinkedSsts{sst}, garbage_blob_count,
            garbage_blob_bytes);
  }

  // Blob file #11: no linked SSTs; 235000 of 400000 bytes are garbage.
  {
    constexpr uint64_t total_blob_count = 4;
    constexpr uint64_t total_blob_bytes = 400000;
    constexpr uint64_t garbage_blob_count = 3;
    constexpr uint64_t garbage_blob_bytes = 235000;

    AddBlob(second_blob, total_blob_count, total_blob_bytes,
            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
            garbage_blob_bytes);
  }

  // Blob file #12: no linked SSTs; 400000 of 1000000 bytes are garbage.
  {
    constexpr uint64_t total_blob_count = 20;
    constexpr uint64_t total_blob_bytes = 1000000;
    constexpr uint64_t garbage_blob_count = 8;
    constexpr uint64_t garbage_blob_bytes = 400000;

    AddBlob(third_blob, total_blob_count, total_blob_bytes,
            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
            garbage_blob_bytes);
  }

  // Blob file #13: no linked SSTs; 600000 of 1000000 bytes are garbage.
  {
    constexpr uint64_t total_blob_count = 128;
    constexpr uint64_t total_blob_bytes = 1000000;
    constexpr uint64_t garbage_blob_count = 67;
    constexpr uint64_t garbage_blob_bytes = 600000;

    AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
            BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
            garbage_blob_bytes);
  }

  // Summed over all four blob files: 1250000 garbage bytes out of 2500000
  // total bytes, i.e. an overall garbage ratio of exactly 0.5.

  UpdateVersionStorageInfo();

  // Verify the fixture with gtest assertions rather than assert(): assert()
  // compiles out under NDEBUG, which would leave level_files[0] below
  // unchecked, and in debug builds it aborts the whole test binary instead of
  // reporting a test failure.
  ASSERT_GT(vstorage_.num_levels(), 0);
  const auto& level_files = vstorage_.LevelFiles(level);

  ASSERT_EQ(level_files.size(), 1);
  ASSERT_NE(level_files[0], nullptr);
  ASSERT_EQ(level_files[0]->fd.GetNumber(), sst);

  // No blob files eligible for GC due to the age cutoff

  {
    constexpr double age_cutoff = 0.1;
    constexpr double force_threshold = 0.0;
    vstorage_.ComputeFilesMarkedForForcedBlobGC(
        age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true);

    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
  }

  // Overall garbage ratio of eligible files is below threshold
  // (0.5 overall versus a threshold of 0.6)

  {
    constexpr double age_cutoff = 1.0;
    constexpr double force_threshold = 0.6;
    vstorage_.ComputeFilesMarkedForForcedBlobGC(
        age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true);

    ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
  }

  // Overall garbage ratio of eligible files meets threshold
  // (0.5 overall versus a threshold of 0.5)

  {
    constexpr double age_cutoff = 1.0;
    constexpr double force_threshold = 0.5;
    vstorage_.ComputeFilesMarkedForForcedBlobGC(
        age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true);

    // The one SST file must now be marked, together with its level.
    auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
    ASSERT_EQ(ssts_to_be_compacted.size(), 1);

    const autovector<std::pair<int, FileMetaData*>>
        expected_ssts_to_be_compacted{{level, level_files[0]}};

    ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
  }
}
|
|
|
|
|
|
|
|
TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) {
|
2021-10-12 17:18:42 +00:00
|
|
|
// Add three L0 SSTs (1, 2, and 3) and four blob files (10, 11, 12, and 13).
|
|
|
|
// The first two SSTs have the same oldest blob file, namely, the very oldest
|
|
|
|
// one (10), while the third SST's oldest blob file reference points to the
|
|
|
|
// third blob file (12). Thus, the oldest batch of blob files contains the
|
|
|
|
// first two blob files 10 and 11, and assuming they are eligible for GC based
|
|
|
|
// on the age cutoff, compacting away the SSTs 1 and 2 will eliminate them.
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
|
|
|
|
constexpr int level = 0;
|
|
|
|
|
|
|
|
constexpr uint64_t first_sst = 1;
|
|
|
|
constexpr uint64_t second_sst = 2;
|
|
|
|
constexpr uint64_t third_sst = 3;
|
|
|
|
|
|
|
|
constexpr uint64_t first_blob = 10;
|
|
|
|
constexpr uint64_t second_blob = 11;
|
|
|
|
constexpr uint64_t third_blob = 12;
|
|
|
|
constexpr uint64_t fourth_blob = 13;
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr char smallest[] = "bar1";
|
|
|
|
constexpr char largest[] = "foo1";
|
|
|
|
constexpr uint64_t file_size = 1000;
|
|
|
|
|
|
|
|
Add(level, first_sst, smallest, largest, file_size, first_blob);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr char smallest[] = "bar2";
|
|
|
|
constexpr char largest[] = "foo2";
|
|
|
|
constexpr uint64_t file_size = 2000;
|
|
|
|
|
|
|
|
Add(level, second_sst, smallest, largest, file_size, first_blob);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr char smallest[] = "bar3";
|
|
|
|
constexpr char largest[] = "foo3";
|
|
|
|
constexpr uint64_t file_size = 3000;
|
|
|
|
|
|
|
|
Add(level, third_sst, smallest, largest, file_size, third_blob);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr uint64_t total_blob_count = 10;
|
|
|
|
constexpr uint64_t total_blob_bytes = 100000;
|
|
|
|
constexpr uint64_t garbage_blob_count = 2;
|
|
|
|
constexpr uint64_t garbage_blob_bytes = 15000;
|
|
|
|
|
|
|
|
AddBlob(first_blob, total_blob_count, total_blob_bytes,
|
|
|
|
BlobFileMetaData::LinkedSsts{first_sst, second_sst},
|
|
|
|
garbage_blob_count, garbage_blob_bytes);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr uint64_t total_blob_count = 4;
|
|
|
|
constexpr uint64_t total_blob_bytes = 400000;
|
|
|
|
constexpr uint64_t garbage_blob_count = 3;
|
|
|
|
constexpr uint64_t garbage_blob_bytes = 235000;
|
|
|
|
|
|
|
|
AddBlob(second_blob, total_blob_count, total_blob_bytes,
|
|
|
|
BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
|
|
|
|
garbage_blob_bytes);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr uint64_t total_blob_count = 20;
|
|
|
|
constexpr uint64_t total_blob_bytes = 1000000;
|
|
|
|
constexpr uint64_t garbage_blob_count = 8;
|
|
|
|
constexpr uint64_t garbage_blob_bytes = 123456;
|
|
|
|
|
|
|
|
AddBlob(third_blob, total_blob_count, total_blob_bytes,
|
|
|
|
BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count,
|
|
|
|
garbage_blob_bytes);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr uint64_t total_blob_count = 128;
|
|
|
|
constexpr uint64_t total_blob_bytes = 789012345;
|
|
|
|
constexpr uint64_t garbage_blob_count = 67;
|
|
|
|
constexpr uint64_t garbage_blob_bytes = 88888888;
|
|
|
|
|
|
|
|
AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
|
|
|
|
BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
|
|
|
|
garbage_blob_bytes);
|
|
|
|
}
|
|
|
|
|
2022-02-04 16:18:18 +00:00
|
|
|
UpdateVersionStorageInfo();
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
|
|
|
|
assert(vstorage_.num_levels() > 0);
|
|
|
|
const auto& level_files = vstorage_.LevelFiles(level);
|
|
|
|
|
|
|
|
assert(level_files.size() == 3);
|
|
|
|
assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst);
|
|
|
|
assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst);
|
|
|
|
assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst);
|
|
|
|
|
|
|
|
// No blob files eligible for GC due to the age cutoff
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr double age_cutoff = 0.1;
|
|
|
|
constexpr double force_threshold = 0.0;
|
2023-10-12 22:26:10 +00:00
|
|
|
vstorage_.ComputeFilesMarkedForForcedBlobGC(
|
|
|
|
age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true);
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
|
|
|
|
ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
|
|
|
|
}
|
|
|
|
|
2024-09-19 22:47:13 +00:00
|
|
|
// Overall garbage ratio of eligible files is below threshold
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
constexpr double age_cutoff = 0.5;
|
|
|
|
constexpr double force_threshold = 0.6;
|
2023-10-12 22:26:10 +00:00
|
|
|
vstorage_.ComputeFilesMarkedForForcedBlobGC(
|
|
|
|
age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true);
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
|
|
|
|
ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
|
|
|
|
}
|
|
|
|
|
2024-09-19 22:47:13 +00:00
|
|
|
// Overall garbage ratio of eligible files meets threshold
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
constexpr double age_cutoff = 0.5;
|
|
|
|
constexpr double force_threshold = 0.5;
|
2023-10-12 22:26:10 +00:00
|
|
|
vstorage_.ComputeFilesMarkedForForcedBlobGC(
|
|
|
|
age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true);
|
2022-02-11 16:24:44 +00:00
|
|
|
|
|
|
|
auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
|
|
|
|
ASSERT_EQ(ssts_to_be_compacted.size(), 2);
|
|
|
|
|
|
|
|
std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
|
|
|
|
[](const std::pair<int, FileMetaData*>& lhs,
|
|
|
|
const std::pair<int, FileMetaData*>& rhs) {
|
|
|
|
assert(lhs.second);
|
|
|
|
assert(rhs.second);
|
|
|
|
return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
|
|
|
|
});
|
|
|
|
|
|
|
|
const autovector<std::pair<int, FileMetaData*>>
|
|
|
|
expected_ssts_to_be_compacted{{level, level_files[0]},
|
|
|
|
{level, level_files[1]}};
|
|
|
|
|
|
|
|
ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
2021-10-12 01:00:44 +00:00
|
|
|
ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-04-10 23:03:33 +00:00
|
|
|
class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase {
|
|
|
|
public:
|
|
|
|
VersionStorageInfoTimestampTest()
|
2022-02-08 20:14:25 +00:00
|
|
|
: VersionStorageInfoTestBase(test::BytewiseComparatorWithU64TsWrapper()) {
|
|
|
|
}
|
2023-12-04 19:17:32 +00:00
|
|
|
~VersionStorageInfoTimestampTest() override = default;
|
2020-04-10 23:03:33 +00:00
|
|
|
std::string Timestamp(uint64_t ts) const {
|
|
|
|
std::string ret;
|
|
|
|
PutFixed64(&ret, ts);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
std::string PackUserKeyAndTimestamp(const Slice& ukey, uint64_t ts) const {
|
|
|
|
std::string ret;
|
|
|
|
ret.assign(ukey.data(), ukey.size());
|
|
|
|
PutFixed64(&ret, ts);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(VersionStorageInfoTimestampTest, GetOverlappingInputs) {
  // Builds an internal key (sequence number 0, kTypeValue) whose user key is
  // `ukey` with the timestamp `ts` appended.
  const auto ikey = [this](const char* ukey, uint64_t ts) {
    return InternalKey(PackUserKeyAndTimestamp(ukey, ts), /*s=*/0, kTypeValue);
  };

  // Three files on level 1, written below as user-key@timestamp:
  //   file 1: [a@9,  a@8]
  //   file 2: [a@5,  b@10]
  //   file 3: [c@12, d@1]
  Add(/*level=*/1, /*file_number=*/1, /*smallest=*/ikey("a", /*ts=*/9),
      /*largest=*/ikey("a", /*ts=*/8), /*file_size=*/100);
  Add(/*level=*/1, /*file_number=*/2, /*smallest=*/ikey("a", /*ts=*/5),
      /*largest=*/ikey("b", /*ts=*/10), /*file_size=*/100);
  Add(/*level=*/1, /*file_number=*/3, /*smallest=*/ikey("c", /*ts=*/12),
      /*largest=*/ikey("d", /*ts=*/1), /*file_size=*/100);

  UpdateVersionStorageInfo();

  // A range over user key "a" with timestamps 12..11 is expected to return
  // files 1 and 2, although neither file holds those exact timestamps.
  // NOTE(review): presumably overlap is decided on the user key without its
  // timestamp -- confirm against GetOverlappingInputs.
  ASSERT_EQ("1,2", GetOverlappingFiles(/*level=*/1, ikey("a", /*ts=*/12),
                                       ikey("a", /*ts=*/11)));

  // A range over user key "c" is expected to return only file 3.
  ASSERT_EQ("3", GetOverlappingFiles(/*level=*/1, ikey("c", /*ts=*/15),
                                     ikey("c", /*ts=*/2)));
}
|
2018-07-14 00:34:54 +00:00
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
class FindLevelFileTest : public testing::Test {
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
public:
|
2014-10-28 17:03:13 +00:00
|
|
|
LevelFilesBrief file_level_;
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 --writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
// Forwarded by Overlaps() as the second argument of SomeFileOverlapsRange().
// Tests that add overlapping files set this to false before querying.
bool disjoint_sorted_files_;
|
|
|
|
// Supplies the memory for the FdWithKeyRange array (LevelFileInit) and for
// the encoded boundary keys copied in by Add().
Arena arena_;
|
|
|
|
|
2022-11-02 21:34:24 +00:00
|
|
|
// A fresh fixture starts with disjoint_sorted_files_ set to true.
FindLevelFileTest() : disjoint_sorted_files_{true} {}
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
|
2023-12-04 19:17:32 +00:00
|
|
|
// Nothing to release by hand: the members clean up after themselves.
~FindLevelFileTest() override = default;
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
|
|
|
|
void LevelFileInit(size_t num = 0) {
|
|
|
|
char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange));
|
2022-11-02 21:34:24 +00:00
|
|
|
file_level_.files = new (mem) FdWithKeyRange[num];
|
2014-07-11 19:52:41 +00:00
|
|
|
file_level_.num_files = 0;
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void Add(const char* smallest, const char* largest,
|
|
|
|
SequenceNumber smallest_seq = 100,
|
|
|
|
SequenceNumber largest_seq = 100) {
|
|
|
|
InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue);
|
|
|
|
InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue);
|
|
|
|
|
|
|
|
Slice smallest_slice = smallest_key.Encode();
|
|
|
|
Slice largest_slice = largest_key.Encode();
|
|
|
|
|
2022-11-02 21:34:24 +00:00
|
|
|
char* mem =
|
|
|
|
arena_.AllocateAligned(smallest_slice.size() + largest_slice.size());
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
memcpy(mem, smallest_slice.data(), smallest_slice.size());
|
|
|
|
memcpy(mem + smallest_slice.size(), largest_slice.data(),
|
2022-11-02 21:34:24 +00:00
|
|
|
largest_slice.size());
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
|
2014-07-11 19:52:41 +00:00
|
|
|
// add to file_level_
|
|
|
|
size_t num = file_level_.num_files;
|
|
|
|
auto& file = file_level_.files[num];
|
2014-07-10 06:40:03 +00:00
|
|
|
file.fd = FileDescriptor(num + 1, 0, 0);
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
file.smallest_key = Slice(mem, smallest_slice.size());
|
2022-11-02 21:34:24 +00:00
|
|
|
file.largest_key = Slice(mem + smallest_slice.size(), largest_slice.size());
|
2014-07-11 19:52:41 +00:00
|
|
|
file_level_.num_files++;
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int Find(const char* key) {
|
|
|
|
InternalKey target(key, 100, kTypeValue);
|
|
|
|
InternalKeyComparator cmp(BytewiseComparator());
|
2014-07-11 19:52:41 +00:00
|
|
|
return FindFile(cmp, file_level_, target.Encode());
|
create compressed_levels_ in Version, allocate its space using arena. Make Version::Get, Version::FindFile faster
Summary:
Define CompressedFileMetaData that just contains fd, smallest_slice, largest_slice. Create compressed_levels_ in Version, the space is allocated using arena
Thus increase the file meta data locality, speed up "Get" and "FindFile"
benchmark with in-memory tmpfs, could have 4% improvement under "random read" and 2% improvement under "read while writing"
benchmark command:
./db_bench --db=/mnt/db/rocksdb --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=33554432 --max_bytes_for_level_base=1073741824 --disable_wal=0 --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --benchmarks=readwhilewriting,readwhilewriting,readwhilewriting --use_existing_db=1 --num=52428800 --threads=1 —writes_per_second=81920
Read Random:
From 1.8363 ms/op, improve to 1.7587 ms/op.
Read while writing:
From 2.985 ms/op, improve to 2.924 ms/op.
Test Plan:
make all check
Reviewers: ljin, haobo, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, igor
Differential Revision: https://reviews.facebook.net/D19419
2014-07-10 05:14:39 +00:00
|
|
|
}
|
2011-06-22 18:45:39 +00:00
|
|
|
|
|
|
|
bool Overlaps(const char* smallest, const char* largest) {
|
|
|
|
InternalKeyComparator cmp(BytewiseComparator());
|
2013-03-01 02:04:58 +00:00
|
|
|
Slice s(smallest != nullptr ? smallest : "");
|
|
|
|
Slice l(largest != nullptr ? largest : "");
|
2014-07-11 19:52:41 +00:00
|
|
|
return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_,
|
2013-03-01 02:04:58 +00:00
|
|
|
(smallest != nullptr ? &s : nullptr),
|
|
|
|
(largest != nullptr ? &l : nullptr));
|
2011-06-22 18:45:39 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// With no files in the level, every lookup lands at index 0 and no range,
// bounded or unbounded, overlaps anything.
TEST_F(FindLevelFileTest, LevelEmpty) {
  LevelFileInit(0);

  ASSERT_EQ(0, Find("foo"));

  ASSERT_FALSE(Overlaps("a", "z"));
  ASSERT_FALSE(Overlaps(nullptr, "z"));
  ASSERT_FALSE(Overlaps("a", nullptr));
  ASSERT_FALSE(Overlaps(nullptr, nullptr));
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// A level holding the single file ["p", "q"].
TEST_F(FindLevelFileTest, LevelSingle) {
  LevelFileInit(1);
  Add("p", "q");

  // Keys up to and including "q" map to file 0; anything after maps to 1.
  ASSERT_EQ(0, Find("a"));
  ASSERT_EQ(0, Find("p"));
  ASSERT_EQ(0, Find("p1"));
  ASSERT_EQ(0, Find("q"));
  ASSERT_EQ(1, Find("q1"));
  ASSERT_EQ(1, Find("z"));

  // Ranges lying entirely before or after the file do not overlap it.
  ASSERT_FALSE(Overlaps("a", "b"));
  ASSERT_FALSE(Overlaps("z1", "z2"));

  // Ranges touching or crossing ["p", "q"] do.
  ASSERT_TRUE(Overlaps("a", "p"));
  ASSERT_TRUE(Overlaps("a", "q"));
  ASSERT_TRUE(Overlaps("a", "z"));
  ASSERT_TRUE(Overlaps("p", "p1"));
  ASSERT_TRUE(Overlaps("p", "q"));
  ASSERT_TRUE(Overlaps("p", "z"));
  ASSERT_TRUE(Overlaps("p1", "p2"));
  ASSERT_TRUE(Overlaps("p1", "z"));
  ASSERT_TRUE(Overlaps("q", "q"));
  ASSERT_TRUE(Overlaps("q", "q1"));

  // Open-ended ranges (nullptr bound on either side).
  ASSERT_FALSE(Overlaps(nullptr, "j"));
  ASSERT_FALSE(Overlaps("r", nullptr));
  ASSERT_TRUE(Overlaps(nullptr, "p"));
  ASSERT_TRUE(Overlaps(nullptr, "p1"));
  ASSERT_TRUE(Overlaps("q", nullptr));
  ASSERT_TRUE(Overlaps(nullptr, nullptr));
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// A level with four files: [150,200], [200,250], [300,350], [400,450].
TEST_F(FindLevelFileTest, LevelMultiple) {
  LevelFileInit(4);
  Add("150", "200");
  Add("200", "250");
  Add("300", "350");
  Add("400", "450");

  // File 0: everything up to and including "200".
  ASSERT_EQ(0, Find("100"));
  ASSERT_EQ(0, Find("150"));
  ASSERT_EQ(0, Find("151"));
  ASSERT_EQ(0, Find("199"));
  ASSERT_EQ(0, Find("200"));
  // File 1: ("200", "250"].
  ASSERT_EQ(1, Find("201"));
  ASSERT_EQ(1, Find("249"));
  ASSERT_EQ(1, Find("250"));
  // File 2: ("250", "350"], including the gap before "300".
  ASSERT_EQ(2, Find("251"));
  ASSERT_EQ(2, Find("299"));
  ASSERT_EQ(2, Find("300"));
  ASSERT_EQ(2, Find("349"));
  ASSERT_EQ(2, Find("350"));
  // File 3: ("350", "450"].
  ASSERT_EQ(3, Find("351"));
  ASSERT_EQ(3, Find("400"));
  ASSERT_EQ(3, Find("450"));
  // Past the last file: the expected index equals the file count.
  ASSERT_EQ(4, Find("451"));

  // Ranges that fall before, between, or after the files.
  ASSERT_FALSE(Overlaps("100", "149"));
  ASSERT_FALSE(Overlaps("251", "299"));
  ASSERT_FALSE(Overlaps("451", "500"));
  ASSERT_FALSE(Overlaps("351", "399"));

  // Ranges that touch or span at least one file.
  ASSERT_TRUE(Overlaps("100", "150"));
  ASSERT_TRUE(Overlaps("100", "200"));
  ASSERT_TRUE(Overlaps("100", "300"));
  ASSERT_TRUE(Overlaps("100", "400"));
  ASSERT_TRUE(Overlaps("100", "500"));
  ASSERT_TRUE(Overlaps("375", "400"));
  ASSERT_TRUE(Overlaps("450", "450"));
  ASSERT_TRUE(Overlaps("450", "500"));
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// Same four-file level as LevelMultiple, queried with open-ended ranges
// (nullptr for the lower and/or upper bound).
TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) {
  LevelFileInit(4);
  Add("150", "200");
  Add("200", "250");
  Add("300", "350");
  Add("400", "450");

  // Open-ended ranges that stop before the first file or start after the
  // last one.
  ASSERT_FALSE(Overlaps(nullptr, "149"));
  ASSERT_FALSE(Overlaps("451", nullptr));

  // Fully unbounded range.
  ASSERT_TRUE(Overlaps(nullptr, nullptr));

  // Unbounded below.
  ASSERT_TRUE(Overlaps(nullptr, "150"));
  ASSERT_TRUE(Overlaps(nullptr, "199"));
  ASSERT_TRUE(Overlaps(nullptr, "200"));
  ASSERT_TRUE(Overlaps(nullptr, "201"));
  ASSERT_TRUE(Overlaps(nullptr, "400"));
  ASSERT_TRUE(Overlaps(nullptr, "800"));

  // Unbounded above.
  ASSERT_TRUE(Overlaps("100", nullptr));
  ASSERT_TRUE(Overlaps("200", nullptr));
  ASSERT_TRUE(Overlaps("449", nullptr));
  ASSERT_TRUE(Overlaps("450", nullptr));
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// A single file whose smallest and largest user keys are both "200" but whose
// boundary sequence numbers differ (5000 and 3000).
// NOTE(review): presumably checks that overlap is decided on user keys alone,
// regardless of sequence numbers -- confirm against SomeFileOverlapsRange().
TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) {
  LevelFileInit(1);
  Add("200", "200", 5000, 3000);

  ASSERT_FALSE(Overlaps("199", "199"));
  ASSERT_FALSE(Overlaps("201", "300"));

  ASSERT_TRUE(Overlaps("200", "200"));
  ASSERT_TRUE(Overlaps("190", "200"));
  ASSERT_TRUE(Overlaps("200", "210"));
}
|
|
|
|
|
2015-03-17 21:08:00 +00:00
|
|
|
// Two files whose ranges overlap ([150,600] contains [400,500]), queried with
// disjoint_sorted_files_ switched off.
TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
  LevelFileInit(2);
  Add("150", "600");
  Add("400", "500");
  disjoint_sorted_files_ = false;

  // Entirely before "150" or entirely after "600".
  ASSERT_FALSE(Overlaps("100", "149"));
  ASSERT_FALSE(Overlaps("601", "700"));

  // Anything touching [150,600] overlaps.
  ASSERT_TRUE(Overlaps("100", "150"));
  ASSERT_TRUE(Overlaps("100", "200"));
  ASSERT_TRUE(Overlaps("100", "300"));
  ASSERT_TRUE(Overlaps("100", "400"));
  ASSERT_TRUE(Overlaps("100", "500"));
  ASSERT_TRUE(Overlaps("375", "400"));
  ASSERT_TRUE(Overlaps("450", "450"));
  ASSERT_TRUE(Overlaps("450", "500"));
  ASSERT_TRUE(Overlaps("450", "700"));
  ASSERT_TRUE(Overlaps("600", "700"));
}
|
|
|
|
|
2018-12-13 23:10:16 +00:00
|
|
|
class VersionSetTestBase {
|
2018-06-28 19:16:10 +00:00
|
|
|
public:
|
2018-12-13 23:10:16 +00:00
|
|
|
const static std::string kColumnFamilyName1;
|
|
|
|
const static std::string kColumnFamilyName2;
|
|
|
|
const static std::string kColumnFamilyName3;
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
const static int kNumColumnFamilies = 4;
|
2019-06-04 17:51:22 +00:00
|
|
|
int num_initial_edits_;
|
2018-12-13 23:10:16 +00:00
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
explicit VersionSetTestBase(const std::string& name)
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
: env_(nullptr),
|
2020-03-21 02:17:54 +00:00
|
|
|
dbname_(test::PerThreadDBPath(name)),
|
|
|
|
options_(),
|
|
|
|
db_options_(options_),
|
|
|
|
cf_options_(options_),
|
2021-06-16 23:50:43 +00:00
|
|
|
immutable_options_(db_options_, cf_options_),
|
2018-06-28 19:16:10 +00:00
|
|
|
mutable_cf_options_(cf_options_),
|
|
|
|
table_cache_(NewLRUCache(50000, 16)),
|
|
|
|
write_buffer_manager_(db_options_.db_write_buffer_size),
|
|
|
|
shutting_down_(false),
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
table_factory_(std::make_shared<mock::MockTableFactory>()) {
|
2021-06-15 10:42:52 +00:00
|
|
|
EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_));
|
|
|
|
if (env_ == Env::Default() && getenv("MEM_ENV")) {
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
env_guard_.reset(NewMemEnv(Env::Default()));
|
|
|
|
env_ = env_guard_.get();
|
2020-03-21 02:17:54 +00:00
|
|
|
}
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
EXPECT_NE(nullptr, env_);
|
2020-03-21 02:17:54 +00:00
|
|
|
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
fs_ = env_->GetFileSystem();
|
|
|
|
EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr));
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not carry enough metadata about an error, such as whether it is retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
options_.env = env_;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not carry enough metadata about an error, such as whether it is retryable or not, the scope (i.e., fault domain) of the error, etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy, etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
db_options_.env = env_;
|
|
|
|
db_options_.fs = fs_;
|
2021-06-16 23:50:43 +00:00
|
|
|
immutable_options_.env = env_;
|
|
|
|
immutable_options_.fs = fs_;
|
|
|
|
immutable_options_.clock = env_->GetSystemClock().get();
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
|
2024-10-17 21:13:20 +00:00
|
|
|
cf_options_.table_factory = table_factory_;
|
|
|
|
mutable_cf_options_.table_factory = table_factory_;
|
|
|
|
|
2023-10-27 22:56:48 +00:00
|
|
|
versions_.reset(new VersionSet(
|
|
|
|
dbname_, &db_options_, env_options_, table_cache_.get(),
|
|
|
|
&write_buffer_manager_, &write_controller_,
|
|
|
|
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
|
2023-11-11 16:11:11 +00:00
|
|
|
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
|
2023-12-29 02:25:29 +00:00
|
|
|
/*error_handler=*/nullptr, /*read_only=*/false));
|
2020-03-21 02:17:54 +00:00
|
|
|
reactive_versions_ = std::make_shared<ReactiveVersionSet>(
|
|
|
|
dbname_, &db_options_, env_options_, table_cache_.get(),
|
2020-08-13 00:28:10 +00:00
|
|
|
&write_buffer_manager_, &write_controller_, nullptr);
|
2018-06-28 19:16:10 +00:00
|
|
|
db_options_.db_paths.emplace_back(dbname_,
|
|
|
|
std::numeric_limits<uint64_t>::max());
|
|
|
|
}
|
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
// Fixture teardown. If the KEEP_DB environment variable is set, the database
// directory is left on disk and its location is printed to stdout; otherwise
// the database at dbname_ is destroyed using the Env this fixture was set up
// with (env_), and the result is checked with EXPECT_OK.
virtual ~VersionSetTestBase() {
  const bool keep_db = getenv("KEEP_DB") != nullptr;
  if (keep_db) {
    fprintf(stdout, "DB is still at %s\n", dbname_.c_str());
    return;
  }

  // Destroy through env_ so the cleanup targets the same Env as the test.
  Options destroy_options;
  destroy_options.env = env_;
  EXPECT_OK(DestroyDB(dbname_, destroy_options));
}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
virtual void PrepareManifest(
|
|
|
|
std::vector<ColumnFamilyDescriptor>* column_families,
|
|
|
|
SequenceNumber* last_seqno, std::unique_ptr<log::Writer>* log_writer) {
|
2018-10-30 23:35:58 +00:00
|
|
|
assert(column_families != nullptr);
|
|
|
|
assert(last_seqno != nullptr);
|
|
|
|
assert(log_writer != nullptr);
|
2024-09-19 21:05:21 +00:00
|
|
|
ASSERT_OK(
|
|
|
|
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
|
2018-06-28 19:16:10 +00:00
|
|
|
VersionEdit new_db;
|
2019-09-03 15:50:47 +00:00
|
|
|
if (db_options_.write_dbid_to_manifest) {
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
DBOptions tmp_db_options;
|
|
|
|
tmp_db_options.env = env_;
|
|
|
|
std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
|
2019-09-03 15:50:47 +00:00
|
|
|
std::string db_id;
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(impl->GetDbIdentityFromIdentityFile(&db_id));
|
2019-09-03 15:50:47 +00:00
|
|
|
new_db.SetDBId(db_id);
|
|
|
|
}
|
2018-06-28 19:16:10 +00:00
|
|
|
new_db.SetLogNumber(0);
|
|
|
|
new_db.SetNextFile(2);
|
|
|
|
new_db.SetLastSequence(0);
|
|
|
|
|
2018-12-13 23:10:16 +00:00
|
|
|
const std::vector<std::string> cf_names = {
|
|
|
|
kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
|
|
|
|
kColumnFamilyName3};
|
2018-06-28 19:16:10 +00:00
|
|
|
const int kInitialNumOfCfs = static_cast<int>(cf_names.size());
|
|
|
|
autovector<VersionEdit> new_cfs;
|
|
|
|
uint64_t last_seq = 1;
|
|
|
|
uint32_t cf_id = 1;
|
|
|
|
for (int i = 1; i != kInitialNumOfCfs; ++i) {
|
|
|
|
VersionEdit new_cf;
|
|
|
|
new_cf.AddColumnFamily(cf_names[i]);
|
|
|
|
new_cf.SetColumnFamily(cf_id++);
|
|
|
|
new_cf.SetLogNumber(0);
|
|
|
|
new_cf.SetNextFile(2);
|
|
|
|
new_cf.SetLastSequence(last_seq++);
|
|
|
|
new_cfs.emplace_back(new_cf);
|
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
*last_seqno = last_seq;
|
2019-06-04 17:51:22 +00:00
|
|
|
num_initial_edits_ = static_cast<int>(new_cfs.size() + 1);
|
2021-01-29 06:08:46 +00:00
|
|
|
std::unique_ptr<WritableFileWriter> file_writer;
|
2018-06-28 19:16:10 +00:00
|
|
|
const std::string manifest = DescriptorFileName(dbname_, 1);
|
2021-01-29 06:08:46 +00:00
|
|
|
const auto& fs = env_->GetFileSystem();
|
|
|
|
Status s = WritableFileWriter::Create(
|
|
|
|
fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
|
|
|
|
nullptr);
|
2018-06-28 19:16:10 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
{
|
2018-10-30 23:35:58 +00:00
|
|
|
log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
|
2018-06-28 19:16:10 +00:00
|
|
|
std::string record;
|
|
|
|
new_db.EncodeTo(&record);
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2018-06-28 19:16:10 +00:00
|
|
|
for (const auto& e : new_cfs) {
|
2018-10-30 23:35:58 +00:00
|
|
|
record.clear();
|
2018-06-28 19:16:10 +00:00
|
|
|
e.EncodeTo(&record);
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2018-06-28 19:16:10 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
cf_options_.table_factory = table_factory_;
|
2018-06-28 19:16:10 +00:00
|
|
|
for (const auto& cf_name : cf_names) {
|
2018-10-30 23:35:58 +00:00
|
|
|
column_families->emplace_back(cf_name, cf_options_);
|
2018-06-28 19:16:10 +00:00
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
}
|
|
|
|
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
struct SstInfo {
|
|
|
|
uint64_t file_number;
|
|
|
|
std::string column_family;
|
|
|
|
std::string key; // the only key
|
|
|
|
int level = 0;
|
|
|
|
uint64_t epoch_number;
|
2024-08-17 00:18:54 +00:00
|
|
|
bool file_missing = false;
|
|
|
|
uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
SstInfo(uint64_t file_num, const std::string& cf_name,
|
|
|
|
const std::string& _key,
|
2024-08-17 00:18:54 +00:00
|
|
|
uint64_t _epoch_number = kUnknownEpochNumber,
|
|
|
|
bool _file_missing = false,
|
|
|
|
uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
|
|
|
|
: SstInfo(file_num, cf_name, _key, 0, _epoch_number, _file_missing,
|
|
|
|
_oldest_blob_file_number) {}
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
SstInfo(uint64_t file_num, const std::string& cf_name,
|
|
|
|
const std::string& _key, int lvl,
|
2024-08-17 00:18:54 +00:00
|
|
|
uint64_t _epoch_number = kUnknownEpochNumber,
|
|
|
|
bool _file_missing = false,
|
|
|
|
uint64_t _oldest_blob_file_number = kInvalidBlobFileNumber)
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
: file_number(file_num),
|
|
|
|
column_family(cf_name),
|
|
|
|
key(_key),
|
|
|
|
level(lvl),
|
2024-08-17 00:18:54 +00:00
|
|
|
epoch_number(_epoch_number),
|
|
|
|
file_missing(_file_missing),
|
|
|
|
oldest_blob_file_number(_oldest_blob_file_number) {}
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Create dummy sst, return their metadata. Note that only file name and size
|
|
|
|
// are used.
|
|
|
|
void CreateDummyTableFiles(const std::vector<SstInfo>& file_infos,
|
|
|
|
std::vector<FileMetaData>* file_metas) {
|
|
|
|
assert(file_metas != nullptr);
|
|
|
|
for (const auto& info : file_infos) {
|
|
|
|
uint64_t file_num = info.file_number;
|
|
|
|
std::string fname = MakeTableFileName(dbname_, file_num);
|
|
|
|
std::unique_ptr<FSWritableFile> file;
|
|
|
|
Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
std::unique_ptr<WritableFileWriter> fwriter(new WritableFileWriter(
|
|
|
|
std::move(file), fname, FileOptions(), env_->GetSystemClock().get()));
|
|
|
|
InternalTblPropCollFactories internal_tbl_prop_coll_factories;
|
|
|
|
|
|
|
|
const ReadOptions read_options;
|
|
|
|
const WriteOptions write_options;
|
|
|
|
std::unique_ptr<TableBuilder> builder(table_factory_->NewTableBuilder(
|
|
|
|
TableBuilderOptions(
|
|
|
|
immutable_options_, mutable_cf_options_, read_options,
|
|
|
|
write_options, InternalKeyComparator(options_.comparator),
|
|
|
|
&internal_tbl_prop_coll_factories, kNoCompression,
|
|
|
|
CompressionOptions(),
|
|
|
|
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
|
2024-11-01 17:08:35 +00:00
|
|
|
info.column_family, info.level, kUnknownNewestKeyTime),
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
fwriter.get()));
|
|
|
|
InternalKey ikey(info.key, 0, ValueType::kTypeValue);
|
|
|
|
builder->Add(ikey.Encode(), "value");
|
|
|
|
ASSERT_OK(builder->Finish());
|
|
|
|
ASSERT_OK(fwriter->Flush(IOOptions()));
|
|
|
|
uint64_t file_size = 0;
|
|
|
|
s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_NE(0, file_size);
|
|
|
|
file_metas->emplace_back(
|
|
|
|
file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false,
|
2024-08-17 00:18:54 +00:00
|
|
|
Temperature::kUnknown, info.oldest_blob_file_number, 0, 0,
|
|
|
|
info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
|
|
|
|
kNullUniqueId64x2, 0, 0,
|
|
|
|
/* user_defined_timestamps_persisted */ true);
|
|
|
|
if (info.file_missing) {
|
|
|
|
ASSERT_OK(fs_->DeleteFile(fname, IOOptions(), nullptr));
|
|
|
|
}
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-24 02:49:25 +00:00
|
|
|
// Points the "CURRENT" file at the new manifest file (number 1) and asserts
// that the update succeeded.
void CreateCurrentFile() {
  const auto set_current_status =
      SetCurrentFile(WriteOptions(), fs_.get(), dbname_, 1,
                     Temperature::kUnknown,
                     /* dir_contains_current_file */ nullptr);
  ASSERT_OK(set_current_status);
}
|
|
|
|
|
2018-10-30 23:35:58 +00:00
|
|
|
// Create DB with 3 column families.
|
|
|
|
void NewDB() {
|
|
|
|
SequenceNumber last_seqno;
|
|
|
|
std::unique_ptr<log::Writer> log_writer;
|
2020-10-24 05:48:00 +00:00
|
|
|
PrepareManifest(&column_families_, &last_seqno, &log_writer);
|
2018-10-30 23:35:58 +00:00
|
|
|
log_writer.reset();
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2018-06-28 19:16:10 +00:00
|
|
|
|
2020-10-24 05:48:00 +00:00
|
|
|
EXPECT_OK(versions_->Recover(column_families_, false));
|
|
|
|
EXPECT_EQ(column_families_.size(),
|
2018-06-28 19:16:10 +00:00
|
|
|
versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
|
|
|
|
}
|
|
|
|
|
2023-12-29 02:25:29 +00:00
|
|
|
// Closes and destroys the current VersionSet while holding `mutex_`. The
// status returned by Close() is deliberately not inspected.
void CloseDB() {
  mutex_.Lock();
  auto close_status = versions_->Close(nullptr, &mutex_);
  close_status.PermitUncheckedError();
  versions_.reset();
  mutex_.Unlock();
}
|
|
|
|
|
2020-12-10 03:05:14 +00:00
|
|
|
void ReopenDB() {
|
2023-10-27 22:56:48 +00:00
|
|
|
versions_.reset(new VersionSet(
|
|
|
|
dbname_, &db_options_, env_options_, table_cache_.get(),
|
|
|
|
&write_buffer_manager_, &write_controller_,
|
|
|
|
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
|
2023-11-11 16:11:11 +00:00
|
|
|
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
|
2023-12-29 02:25:29 +00:00
|
|
|
/*error_handler=*/nullptr, /*read_only=*/false));
|
2020-12-10 03:05:14 +00:00
|
|
|
EXPECT_OK(versions_->Recover(column_families_, false));
|
|
|
|
}
|
|
|
|
|
2023-12-29 02:25:29 +00:00
|
|
|
void GetManifestPath(std::string* manifest_path) const {
|
|
|
|
assert(manifest_path != nullptr);
|
|
|
|
uint64_t manifest_file_number = 0;
|
|
|
|
Status s = versions_->GetCurrentManifestPath(
|
|
|
|
dbname_, fs_.get(), manifest_path, &manifest_file_number);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
2020-03-21 02:17:54 +00:00
|
|
|
void VerifyManifest(std::string* manifest_path) const {
|
|
|
|
assert(manifest_path != nullptr);
|
|
|
|
uint64_t manifest_file_number = 0;
|
|
|
|
Status s = versions_->GetCurrentManifestPath(
|
|
|
|
dbname_, fs_.get(), manifest_path, &manifest_file_number);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(1, manifest_file_number);
|
|
|
|
}
|
|
|
|
|
2020-10-24 05:48:00 +00:00
|
|
|
// Applies a single `edit` to the default column family while holding
// `mutex_`, and returns the status reported by VersionSet::LogAndApply().
Status LogAndApplyToDefaultCF(VersionEdit& edit) {
  mutex_.Lock();
  auto* const default_cfd = versions_->GetColumnFamilySet()->GetDefault();
  Status result =
      versions_->LogAndApply(default_cfd, mutable_cf_options_, read_options_,
                             write_options_, &edit, &mutex_, nullptr);
  mutex_.Unlock();
  return result;
}
|
|
|
|
|
|
|
|
// Applies a batch of `edits` to the default column family while holding
// `mutex_`, and returns the status reported by VersionSet::LogAndApply().
// Ownership of the edits stays with the caller; only raw pointers are passed
// on.
Status LogAndApplyToDefaultCF(
    const autovector<std::unique_ptr<VersionEdit>>& edits) {
  // Collect non-owning pointers in the same order as the input.
  autovector<VersionEdit*> raw_edits;
  for (const auto& owned_edit : edits) {
    raw_edits.push_back(owned_edit.get());
  }

  mutex_.Lock();
  auto* const default_cfd = versions_->GetColumnFamilySet()->GetDefault();
  Status result =
      versions_->LogAndApply(default_cfd, mutable_cf_options_, read_options_,
                             write_options_, raw_edits, &mutex_, nullptr);
  mutex_.Unlock();
  return result;
}
|
|
|
|
|
|
|
|
void CreateNewManifest() {
|
|
|
|
constexpr FSDirectory* db_directory = nullptr;
|
|
|
|
constexpr bool new_descriptor_log = true;
|
|
|
|
mutex_.Lock();
|
|
|
|
VersionEdit dummy;
|
|
|
|
ASSERT_OK(versions_->LogAndApply(
|
|
|
|
versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
read_options_, write_options_, &dummy, &mutex_, db_directory,
|
|
|
|
new_descriptor_log));
|
2020-10-24 05:48:00 +00:00
|
|
|
mutex_.Unlock();
|
|
|
|
}
|
|
|
|
|
2020-12-05 22:17:11 +00:00
|
|
|
// Test helper: registers a new column family named `cf_name` with the
// VersionSet and returns its ColumnFamilyData.
//
// Builds a VersionEdit that adds the column family under the next ID reported
// by the ColumnFamilySet, with log number 0 and with the comparator name and
// persist_user_defined_timestamps setting taken from `cf_options`, then
// applies that edit through VersionSet::LogAndApply().
// Reports a gtest failure (EXPECT_OK / EXPECT_NE) if the apply fails or the
// column family cannot be looked up by name afterwards; the lookup result is
// returned either way, so it may be nullptr after a failure.
ColumnFamilyData* CreateColumnFamily(const std::string& cf_name,
|
|
|
|
const ColumnFamilyOptions& cf_options) {
|
|
|
|
VersionEdit new_cf;
|
|
|
|
new_cf.AddColumnFamily(cf_name);
|
|
|
|
uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
|
|
|
|
new_cf.SetColumnFamily(new_id);
|
|
|
|
new_cf.SetLogNumber(0);
|
|
|
|
new_cf.SetComparatorName(cf_options.comparator->Name());
|
2023-07-27 03:16:32 +00:00
|
|
|
new_cf.SetPersistUserDefinedTimestamps(
|
|
|
|
cf_options.persist_user_defined_timestamps);
|
2020-12-05 22:17:11 +00:00
|
|
|
Status s;
|
|
|
|
// LogAndApply() is handed &mutex_, so the call is made with the lock held.
mutex_.Lock();
|
|
|
|
s = versions_->LogAndApply(/*column_family_data=*/nullptr,
|
2023-04-21 16:07:18 +00:00
|
|
|
MutableCFOptions(cf_options), read_options_,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
write_options_, &new_cf, &mutex_,
|
2020-12-05 22:17:11 +00:00
|
|
|
/*db_directory=*/nullptr,
|
|
|
|
/*new_descriptor_log=*/false, &cf_options);
|
|
|
|
mutex_.Unlock();
|
|
|
|
EXPECT_OK(s);
|
|
|
|
ColumnFamilyData* cfd =
|
|
|
|
versions_->GetColumnFamilySet()->GetColumnFamily(cf_name);
|
|
|
|
EXPECT_NE(nullptr, cfd);
|
|
|
|
return cfd;
|
|
|
|
}
|
|
|
|
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
Env* mem_env_;
|
2018-06-28 19:16:10 +00:00
|
|
|
Env* env_;
|
2020-03-21 02:17:54 +00:00
|
|
|
std::shared_ptr<Env> env_guard_;
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
2019-12-13 22:47:08 +00:00
|
|
|
std::shared_ptr<FileSystem> fs_;
|
2018-06-28 19:16:10 +00:00
|
|
|
const std::string dbname_;
|
|
|
|
EnvOptions env_options_;
|
2020-03-21 02:17:54 +00:00
|
|
|
Options options_;
|
2018-06-28 19:16:10 +00:00
|
|
|
ImmutableDBOptions db_options_;
|
|
|
|
ColumnFamilyOptions cf_options_;
|
2021-06-16 23:50:43 +00:00
|
|
|
ImmutableOptions immutable_options_;
|
2018-06-28 19:16:10 +00:00
|
|
|
MutableCFOptions mutable_cf_options_;
|
2023-04-21 16:07:18 +00:00
|
|
|
const ReadOptions read_options_;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN included files that failed to sync and bytes that failed to be written.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20 ns, regressed 1.85%. "ManualCompaction" includes flush, so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const WriteOptions write_options_;
|
|
|
|
|
2018-06-28 19:16:10 +00:00
|
|
|
std::shared_ptr<Cache> table_cache_;
|
|
|
|
WriteController write_controller_;
|
|
|
|
WriteBufferManager write_buffer_manager_;
|
2018-10-30 23:35:58 +00:00
|
|
|
std::shared_ptr<VersionSet> versions_;
|
2019-06-04 17:51:22 +00:00
|
|
|
std::shared_ptr<ReactiveVersionSet> reactive_versions_;
|
2018-06-28 19:16:10 +00:00
|
|
|
InstrumentedMutex mutex_;
|
|
|
|
std::atomic<bool> shutting_down_;
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
std::shared_ptr<TableFactory> table_factory_;
|
2020-10-24 05:48:00 +00:00
|
|
|
std::vector<ColumnFamilyDescriptor> column_families_;
|
2018-06-28 19:16:10 +00:00
|
|
|
};
|
|
|
|
|
2018-12-13 23:10:16 +00:00
|
|
|
// Out-of-class definitions of the three column family name constants that are
// members of VersionSetTestBase.
const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
|
|
|
|
const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
|
|
|
|
const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
|
|
|
|
|
|
|
|
// gtest fixture used by the TEST_F(VersionSetTest, ...) cases: combines
// VersionSetTestBase with testing::Test and passes the fixed name
// "version_set_test" to the base-class constructor.
class VersionSetTest : public VersionSetTestBase, public testing::Test {
|
|
|
|
public:
|
2020-03-21 02:17:54 +00:00
|
|
|
VersionSetTest() : VersionSetTestBase("version_set_test") {}
|
2018-12-13 23:10:16 +00:00
|
|
|
};
|
|
|
|
|
2018-10-30 23:35:58 +00:00
|
|
|
TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
|
2018-06-28 19:16:10 +00:00
|
|
|
NewDB();
|
|
|
|
const int kGroupSize = 5;
|
2023-04-21 16:07:18 +00:00
|
|
|
const ReadOptions read_options;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN included files that failed to sync and bytes that failed to be written.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20 ns, regressed 1.85%. "ManualCompaction" includes flush, so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const WriteOptions write_options;
|
2023-04-21 16:07:18 +00:00
|
|
|
|
2018-10-16 02:59:20 +00:00
|
|
|
autovector<VersionEdit> edits;
|
2018-06-28 19:16:10 +00:00
|
|
|
for (int i = 0; i != kGroupSize; ++i) {
|
2018-10-16 02:59:20 +00:00
|
|
|
edits.emplace_back(VersionEdit());
|
|
|
|
}
|
|
|
|
autovector<ColumnFamilyData*> cfds;
|
|
|
|
autovector<const MutableCFOptions*> all_mutable_cf_options;
|
|
|
|
autovector<autovector<VersionEdit*>> edit_lists;
|
|
|
|
for (int i = 0; i != kGroupSize; ++i) {
|
2018-10-30 23:35:58 +00:00
|
|
|
cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault());
|
2018-10-16 02:59:20 +00:00
|
|
|
all_mutable_cf_options.emplace_back(&mutable_cf_options_);
|
|
|
|
autovector<VersionEdit*> edit_list;
|
|
|
|
edit_list.emplace_back(&edits[i]);
|
|
|
|
edit_lists.emplace_back(edit_list);
|
2018-06-28 19:16:10 +00:00
|
|
|
}
|
|
|
|
|
2018-10-30 23:35:58 +00:00
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
2018-06-28 19:16:10 +00:00
|
|
|
int count = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technically UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
uint32_t* cf_id = static_cast<uint32_t*>(arg);
|
2019-09-09 18:22:28 +00:00
|
|
|
EXPECT_EQ(0u, *cf_id);
|
2018-06-28 19:16:10 +00:00
|
|
|
++count;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
mutex_.Lock();
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN included files that failed to sync and bytes that failed to be written.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher because each Append() counts as one post-PR, while pre-PR, 3 Append()s counted as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher because each Append() counts as one post-PR, while pre-PR, 3 Append()s counted as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20 ns, regressed 1.85%. "ManualCompaction" includes flush, so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
Status s =
|
|
|
|
versions_->LogAndApply(cfds, all_mutable_cf_options, read_options,
|
|
|
|
write_options, edit_lists, &mutex_, nullptr);
|
2018-06-28 19:16:10 +00:00
|
|
|
mutex_.Unlock();
|
|
|
|
EXPECT_OK(s);
|
|
|
|
EXPECT_EQ(kGroupSize - 1, count);
|
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2020-04-02 18:51:17 +00:00
|
|
|
TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) {
|
|
|
|
// Initialize the database and add a couple of blob files, one with some
|
|
|
|
// garbage in it, and one without any garbage.
|
|
|
|
NewDB();
|
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
assert(versions_);
|
|
|
|
assert(versions_->GetColumnFamilySet());
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
|
|
|
|
assert(cfd);
|
|
|
|
|
|
|
|
Version* const version = cfd->current();
|
|
|
|
assert(version);
|
|
|
|
|
|
|
|
VersionStorageInfo* const storage_info = version->storage_info();
|
|
|
|
assert(storage_info);
|
2020-04-02 18:51:17 +00:00
|
|
|
|
|
|
|
{
|
|
|
|
constexpr uint64_t blob_file_number = 123;
|
|
|
|
constexpr uint64_t total_blob_count = 456;
|
|
|
|
constexpr uint64_t total_blob_bytes = 77777777;
|
|
|
|
constexpr char checksum_method[] = "SHA1";
|
|
|
|
constexpr char checksum_value[] =
|
2021-06-22 16:48:50 +00:00
|
|
|
"\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c"
|
|
|
|
"\x52\x5c\xbd";
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
|
|
|
|
auto shared_meta = SharedBlobFileMetaData::Create(
|
|
|
|
blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
|
|
|
|
checksum_value);
|
|
|
|
|
2020-04-02 18:51:17 +00:00
|
|
|
constexpr uint64_t garbage_blob_count = 89;
|
|
|
|
constexpr uint64_t garbage_blob_bytes = 1000000;
|
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
auto meta = BlobFileMetaData::Create(
|
|
|
|
std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
|
|
|
|
garbage_blob_count, garbage_blob_bytes);
|
|
|
|
|
|
|
|
storage_info->AddBlobFile(std::move(meta));
|
2020-04-02 18:51:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
constexpr uint64_t blob_file_number = 234;
|
|
|
|
constexpr uint64_t total_blob_count = 555;
|
|
|
|
constexpr uint64_t total_blob_bytes = 66666;
|
|
|
|
constexpr char checksum_method[] = "CRC32";
|
2021-06-22 16:48:50 +00:00
|
|
|
constexpr char checksum_value[] = "\x3d\x87\xff\x57";
|
2020-04-02 18:51:17 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
auto shared_meta = SharedBlobFileMetaData::Create(
|
|
|
|
blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
|
|
|
|
checksum_value);
|
2020-04-02 18:51:17 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
constexpr uint64_t garbage_blob_count = 0;
|
|
|
|
constexpr uint64_t garbage_blob_bytes = 0;
|
2020-04-02 18:51:17 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
auto meta = BlobFileMetaData::Create(
|
|
|
|
std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
|
|
|
|
garbage_blob_count, garbage_blob_bytes);
|
2020-04-02 18:51:17 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
storage_info->AddBlobFile(std::move(meta));
|
|
|
|
}
|
2020-04-02 18:51:17 +00:00
|
|
|
|
|
|
|
// Force the creation of a new manifest file and make sure metadata for
|
|
|
|
// the blob files is re-persisted.
|
|
|
|
size_t addition_encoded = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlobFileAddition::EncodeTo::CustomFields",
|
|
|
|
[&](void* /* arg */) { ++addition_encoded; });
|
|
|
|
|
|
|
|
size_t garbage_encoded = 0;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"BlobFileGarbage::EncodeTo::CustomFields",
|
|
|
|
[&](void* /* arg */) { ++garbage_encoded; });
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
2020-10-24 05:48:00 +00:00
|
|
|
CreateNewManifest();
|
2020-04-02 18:51:17 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(addition_encoded, 2);
|
|
|
|
ASSERT_EQ(garbage_encoded, 1);
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
}
|
|
|
|
|
2020-05-04 22:05:34 +00:00
|
|
|
TEST_F(VersionSetTest, AddLiveBlobFiles) {
|
|
|
|
// Initialize the database and add a blob file.
|
|
|
|
NewDB();
|
|
|
|
|
|
|
|
assert(versions_);
|
|
|
|
assert(versions_->GetColumnFamilySet());
|
|
|
|
|
|
|
|
ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
|
|
|
|
assert(cfd);
|
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
Version* const first_version = cfd->current();
|
|
|
|
assert(first_version);
|
|
|
|
|
|
|
|
VersionStorageInfo* const first_storage_info = first_version->storage_info();
|
|
|
|
assert(first_storage_info);
|
2020-05-04 22:05:34 +00:00
|
|
|
|
|
|
|
constexpr uint64_t first_blob_file_number = 234;
|
|
|
|
constexpr uint64_t first_total_blob_count = 555;
|
|
|
|
constexpr uint64_t first_total_blob_bytes = 66666;
|
|
|
|
constexpr char first_checksum_method[] = "CRC32";
|
2021-06-22 16:48:50 +00:00
|
|
|
constexpr char first_checksum_value[] = "\x3d\x87\xff\x57";
|
2020-05-04 22:05:34 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
auto first_shared_meta = SharedBlobFileMetaData::Create(
|
|
|
|
first_blob_file_number, first_total_blob_count, first_total_blob_bytes,
|
|
|
|
first_checksum_method, first_checksum_value);
|
2020-05-04 22:05:34 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
constexpr uint64_t garbage_blob_count = 0;
|
|
|
|
constexpr uint64_t garbage_blob_bytes = 0;
|
2020-05-04 22:05:34 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
auto first_meta = BlobFileMetaData::Create(
|
|
|
|
std::move(first_shared_meta), BlobFileMetaData::LinkedSsts(),
|
|
|
|
garbage_blob_count, garbage_blob_bytes);
|
|
|
|
|
|
|
|
first_storage_info->AddBlobFile(first_meta);
|
2020-05-04 22:05:34 +00:00
|
|
|
|
|
|
|
// Reference the version so it stays alive even after the following version
|
|
|
|
// edit.
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
first_version->Ref();
|
2020-05-04 22:05:34 +00:00
|
|
|
|
|
|
|
// Get live files directly from version.
|
|
|
|
std::vector<uint64_t> version_table_files;
|
|
|
|
std::vector<uint64_t> version_blob_files;
|
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
first_version->AddLiveFiles(&version_table_files, &version_blob_files);
|
2020-05-04 22:05:34 +00:00
|
|
|
|
|
|
|
ASSERT_EQ(version_blob_files.size(), 1);
|
|
|
|
ASSERT_EQ(version_blob_files[0], first_blob_file_number);
|
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
// Create a new version containing an additional blob file.
|
|
|
|
versions_->TEST_CreateAndAppendVersion(cfd);
|
|
|
|
|
|
|
|
Version* const second_version = cfd->current();
|
|
|
|
assert(second_version);
|
|
|
|
assert(second_version != first_version);
|
|
|
|
|
|
|
|
VersionStorageInfo* const second_storage_info =
|
|
|
|
second_version->storage_info();
|
|
|
|
assert(second_storage_info);
|
2020-05-04 22:05:34 +00:00
|
|
|
|
|
|
|
constexpr uint64_t second_blob_file_number = 456;
|
|
|
|
constexpr uint64_t second_total_blob_count = 100;
|
|
|
|
constexpr uint64_t second_total_blob_bytes = 2000000;
|
|
|
|
constexpr char second_checksum_method[] = "CRC32B";
|
2021-06-22 16:48:50 +00:00
|
|
|
constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a";
|
2020-05-04 22:05:34 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
auto second_shared_meta = SharedBlobFileMetaData::Create(
|
|
|
|
second_blob_file_number, second_total_blob_count, second_total_blob_bytes,
|
|
|
|
second_checksum_method, second_checksum_value);
|
2020-05-04 22:05:34 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
auto second_meta = BlobFileMetaData::Create(
|
|
|
|
std::move(second_shared_meta), BlobFileMetaData::LinkedSsts(),
|
|
|
|
garbage_blob_count, garbage_blob_bytes);
|
|
|
|
|
|
|
|
second_storage_info->AddBlobFile(std::move(first_meta));
|
|
|
|
second_storage_info->AddBlobFile(std::move(second_meta));
|
2020-05-04 22:05:34 +00:00
|
|
|
|
|
|
|
// Get all live files from version set. Note that the result contains
|
|
|
|
// duplicates.
|
|
|
|
std::vector<uint64_t> all_table_files;
|
|
|
|
std::vector<uint64_t> all_blob_files;
|
|
|
|
|
|
|
|
versions_->AddLiveFiles(&all_table_files, &all_blob_files);
|
|
|
|
|
|
|
|
ASSERT_EQ(all_blob_files.size(), 3);
|
|
|
|
ASSERT_EQ(all_blob_files[0], first_blob_file_number);
|
|
|
|
ASSERT_EQ(all_blob_files[1], first_blob_file_number);
|
|
|
|
ASSERT_EQ(all_blob_files[2], second_blob_file_number);
|
|
|
|
|
|
|
|
// Clean up previous version.
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
first_version->Unref();
|
2020-05-04 22:05:34 +00:00
|
|
|
}
|
|
|
|
|
2020-04-30 18:23:32 +00:00
|
|
|
TEST_F(VersionSetTest, ObsoleteBlobFile) {
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
// Initialize the database and add a blob file that is entirely garbage
|
|
|
|
// and thus can immediately be marked obsolete.
|
2020-04-30 18:23:32 +00:00
|
|
|
NewDB();
|
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
VersionEdit edit;
|
2020-04-30 18:23:32 +00:00
|
|
|
|
|
|
|
constexpr uint64_t blob_file_number = 234;
|
|
|
|
constexpr uint64_t total_blob_count = 555;
|
|
|
|
constexpr uint64_t total_blob_bytes = 66666;
|
|
|
|
constexpr char checksum_method[] = "CRC32";
|
2021-06-22 16:48:50 +00:00
|
|
|
constexpr char checksum_value[] = "\x3d\x87\xff\x57";
|
2020-04-30 18:23:32 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
|
|
|
|
checksum_method, checksum_value);
|
2020-04-30 18:23:32 +00:00
|
|
|
|
Clean up blob files based on the linked SST set (#7001)
Summary:
The earlier `VersionBuilder` code only cleaned up blob files that were
marked as entirely consisting of garbage using `VersionEdits` with
`BlobFileGarbage`. This covers the cases when table files go through
regular compaction, where we iterate through the KVs and thus have an
opportunity to calculate the amount of garbage (that is, most cases).
However, it does not help when table files are simply dropped (e.g. deletion
compactions or the `DeleteFile` API). To deal with such cases, the patch
adds logic that cleans up all blob files at the head of the list until the first
one with linked SSTs is found. (As an example, let's assume we have blob files
with numbers 1..10, and the first one with any linked SSTs is number 8.
This means that SSTs in the `Version` only rely on blob files with numbers >= 8,
and thus 1..7 are no longer needed.)
The code change itself is pretty small; however, changing the logic like this
necessitated changes to some tests that have been added recently (namely
to the ones that use blob files in isolation, i.e. without any table files referring
to them). Some of these cases were fixed by bypassing `VersionBuilder` altogether
in order to keep the tests simple (which actually makes them more proper unit tests
as well), while the `VersionBuilder` unit tests were fixed by adding dummy table
files to the test cases as needed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7001
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D22119474
Pulled By: ltamasi
fbshipit-source-id: c6547141355667d4291d9661d6518eb741e7b54a
2020-06-30 22:30:01 +00:00
|
|
|
edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes);
|
2020-04-30 18:23:32 +00:00
|
|
|
|
|
|
|
mutex_.Lock();
|
2023-04-21 16:07:18 +00:00
|
|
|
Status s = versions_->LogAndApply(
|
|
|
|
versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
read_options_, write_options_, &edit, &mutex_, nullptr);
|
2020-04-30 18:23:32 +00:00
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
// Make sure blob files from the pending number range are not returned
|
|
|
|
// as obsolete.
|
|
|
|
{
|
|
|
|
std::vector<ObsoleteFileInfo> table_files;
|
|
|
|
std::vector<ObsoleteBlobFileInfo> blob_files;
|
|
|
|
std::vector<std::string> manifest_files;
|
|
|
|
constexpr uint64_t min_pending_output = blob_file_number;
|
|
|
|
|
|
|
|
versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
|
|
|
|
min_pending_output);
|
|
|
|
|
|
|
|
ASSERT_TRUE(blob_files.empty());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure the blob file is returned as obsolete if it's not in the pending
|
|
|
|
// range.
|
|
|
|
{
|
|
|
|
std::vector<ObsoleteFileInfo> table_files;
|
|
|
|
std::vector<ObsoleteBlobFileInfo> blob_files;
|
|
|
|
std::vector<std::string> manifest_files;
|
|
|
|
constexpr uint64_t min_pending_output = blob_file_number + 1;
|
|
|
|
|
|
|
|
versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
|
|
|
|
min_pending_output);
|
|
|
|
|
|
|
|
ASSERT_EQ(blob_files.size(), 1);
|
|
|
|
ASSERT_EQ(blob_files[0].GetBlobFileNumber(), blob_file_number);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make sure it's not returned a second time.
|
|
|
|
{
|
|
|
|
std::vector<ObsoleteFileInfo> table_files;
|
|
|
|
std::vector<ObsoleteBlobFileInfo> blob_files;
|
|
|
|
std::vector<std::string> manifest_files;
|
|
|
|
constexpr uint64_t min_pending_output = blob_file_number + 1;
|
|
|
|
|
|
|
|
versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
|
|
|
|
min_pending_output);
|
|
|
|
|
|
|
|
ASSERT_TRUE(blob_files.empty());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-10-24 05:48:00 +00:00
|
|
|
TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) {
|
|
|
|
NewDB();
|
|
|
|
|
|
|
|
constexpr uint64_t kNumWals = 5;
|
|
|
|
|
|
|
|
autovector<std::unique_ptr<VersionEdit>> edits;
|
|
|
|
// Add some WALs.
|
|
|
|
for (uint64_t i = 1; i <= kNumWals; i++) {
|
|
|
|
edits.emplace_back(new VersionEdit);
|
|
|
|
// WAL's size equals its log number.
|
|
|
|
edits.back()->AddWal(i, WalMetadata(i));
|
|
|
|
}
|
|
|
|
// Delete the first half of the WALs.
|
2020-11-07 00:30:44 +00:00
|
|
|
edits.emplace_back(new VersionEdit);
|
|
|
|
edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
|
2020-10-24 05:48:00 +00:00
|
|
|
|
|
|
|
autovector<Version*> versions;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionSet::ProcessManifestWrites:NewVersion",
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
[&](void* arg) { versions.push_back(static_cast<Version*>(arg)); });
|
2020-10-24 05:48:00 +00:00
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
2020-12-23 07:44:44 +00:00
|
|
|
ASSERT_OK(LogAndApplyToDefaultCF(edits));
|
2020-10-24 05:48:00 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
// Since the edits are all WAL edits, no version should be created.
|
|
|
|
ASSERT_EQ(versions.size(), 1);
|
|
|
|
ASSERT_EQ(versions[0], nullptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit.
|
|
|
|
TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) {
|
|
|
|
NewDB();
|
|
|
|
|
|
|
|
const std::string kDBId = "db_db";
|
|
|
|
constexpr uint64_t kNumWals = 5;
|
|
|
|
|
|
|
|
autovector<std::unique_ptr<VersionEdit>> edits;
|
|
|
|
// Add some WALs.
|
|
|
|
for (uint64_t i = 1; i <= kNumWals; i++) {
|
|
|
|
edits.emplace_back(new VersionEdit);
|
|
|
|
// WAL's size equals its log number.
|
|
|
|
edits.back()->AddWal(i, WalMetadata(i));
|
|
|
|
}
|
|
|
|
// Delete the first half of the WALs.
|
2020-11-07 00:30:44 +00:00
|
|
|
edits.emplace_back(new VersionEdit);
|
|
|
|
edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
|
2020-10-24 05:48:00 +00:00
|
|
|
edits.emplace_back(new VersionEdit);
|
|
|
|
edits.back()->SetDBId(kDBId);
|
|
|
|
|
|
|
|
autovector<Version*> versions;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionSet::ProcessManifestWrites:NewVersion",
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
[&](void* arg) { versions.push_back(static_cast<Version*>(arg)); });
|
2020-10-24 05:48:00 +00:00
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
2020-12-23 07:44:44 +00:00
|
|
|
ASSERT_OK(LogAndApplyToDefaultCF(edits));
|
2020-10-24 05:48:00 +00:00
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
// Since the edits are all WAL edits, no version should be created.
|
|
|
|
ASSERT_EQ(versions.size(), 1);
|
|
|
|
ASSERT_NE(versions[0], nullptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Walks one WAL through creation, several syncs and closing, checking the
// tracked synced size after each step and again on a second VersionSet after
// Recover().
TEST_F(VersionSetTest, WalAddition) {
  NewDB();

  constexpr WalNumber kLogNumber = 10;
  constexpr uint64_t kSizeInBytes = 111;

  // Creation: the WAL is tracked but has no synced size yet.
  {
    VersionEdit creation;
    creation.AddWal(kLogNumber);
    ASSERT_OK(LogAndApplyToDefaultCF(creation));

    const auto& tracked = versions_->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 1);
    const auto entry = tracked.find(kLogNumber);
    ASSERT_TRUE(entry != tracked.end());
    ASSERT_FALSE(entry->second.HasSyncedSize());
  }

  // Several syncs before closing: the shortfall halves each round, so the
  // recorded synced size grows towards kSizeInBytes.
  for (uint64_t shortfall = 100; shortfall > 0; shortfall /= 2) {
    const uint64_t synced = kSizeInBytes - shortfall;
    VersionEdit sync_edit;
    sync_edit.AddWal(kLogNumber, WalMetadata(synced));
    ASSERT_OK(LogAndApplyToDefaultCF(sync_edit));

    const auto& tracked = versions_->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 1);
    const auto entry = tracked.find(kLogNumber);
    ASSERT_TRUE(entry != tracked.end());
    ASSERT_TRUE(entry->second.HasSyncedSize());
    ASSERT_EQ(entry->second.GetSyncedSizeInBytes(), synced);
  }

  // Closing: the full size is recorded.
  {
    VersionEdit close_edit;
    close_edit.AddWal(kLogNumber, WalMetadata(kSizeInBytes));
    ASSERT_OK(LogAndApplyToDefaultCF(close_edit));

    const auto& tracked = versions_->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 1);
    const auto entry = tracked.find(kLogNumber);
    ASSERT_TRUE(entry != tracked.end());
    ASSERT_TRUE(entry->second.HasSyncedSize());
    ASSERT_EQ(entry->second.GetSyncedSizeInBytes(), kSizeInBytes);
  }

  // Recover a new VersionSet; it must report the same closed WAL.
  {
    std::unique_ptr<VersionSet> recovered(new VersionSet(
        dbname_, &db_options_, env_options_, table_cache_.get(),
        &write_buffer_manager_, &write_controller_,
        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
        /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
        /*error_handler=*/nullptr, /*read_only=*/false));
    ASSERT_OK(recovered->Recover(column_families_, /*read_only=*/false));

    const auto& tracked = recovered->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 1);
    const auto entry = tracked.find(kLogNumber);
    ASSERT_TRUE(entry != tracked.end());
    ASSERT_TRUE(entry->second.HasSyncedSize());
    ASSERT_EQ(entry->second.GetSyncedSizeInBytes(), kSizeInBytes);
  }
}
|
|
|
|
|
|
|
|
// A WAL is created, synced once at half its size, and then superseded by a
// WAL with a larger number without a final sync. The older WAL must keep the
// last synced size, both live and on a second VersionSet after Recover().
TEST_F(VersionSetTest, WalCloseWithoutSync) {
  NewDB();

  constexpr WalNumber kLogNumber = 10;
  constexpr uint64_t kSizeInBytes = 111;
  constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2;

  // Creation: tracked, no synced size.
  {
    VersionEdit creation;
    creation.AddWal(kLogNumber);
    ASSERT_OK(LogAndApplyToDefaultCF(creation));

    const auto& tracked = versions_->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 1);
    const auto entry = tracked.find(kLogNumber);
    ASSERT_TRUE(entry != tracked.end());
    ASSERT_FALSE(entry->second.HasSyncedSize());
  }

  // One sync before closing.
  {
    VersionEdit sync_edit;
    sync_edit.AddWal(kLogNumber, WalMetadata(kSyncedSizeInBytes));
    ASSERT_OK(LogAndApplyToDefaultCF(sync_edit));

    const auto& tracked = versions_->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 1);
    const auto entry = tracked.find(kLogNumber);
    ASSERT_TRUE(entry != tracked.end());
    ASSERT_TRUE(entry->second.HasSyncedSize());
    ASSERT_EQ(entry->second.GetSyncedSizeInBytes(), kSyncedSizeInBytes);
  }

  // A new WAL with a larger log number is created, implicitly marking the
  // current WAL closed.
  {
    VersionEdit next_wal;
    next_wal.AddWal(kLogNumber + 1);
    ASSERT_OK(LogAndApplyToDefaultCF(next_wal));

    const auto& tracked = versions_->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 2);

    const auto older = tracked.find(kLogNumber);
    ASSERT_TRUE(older != tracked.end());
    ASSERT_TRUE(older->second.HasSyncedSize());
    ASSERT_EQ(older->second.GetSyncedSizeInBytes(), kSyncedSizeInBytes);

    const auto newer = tracked.find(kLogNumber + 1);
    ASSERT_TRUE(newer != tracked.end());
    ASSERT_FALSE(newer->second.HasSyncedSize());
  }

  // Recover a new VersionSet.
  {
    std::unique_ptr<VersionSet> recovered(new VersionSet(
        dbname_, &db_options_, env_options_, table_cache_.get(),
        &write_buffer_manager_, &write_controller_,
        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
        /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
        /*error_handler=*/nullptr, /*read_only=*/false));
    ASSERT_OK(recovered->Recover(column_families_, /*read_only=*/false));

    const auto& tracked = recovered->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 2);
    const auto older = tracked.find(kLogNumber);
    ASSERT_TRUE(older != tracked.end());
    ASSERT_TRUE(older->second.HasSyncedSize());
    ASSERT_EQ(older->second.GetSyncedSizeInBytes(), kSyncedSizeInBytes);
  }
}
|
|
|
|
|
|
|
|
TEST_F(VersionSetTest, WalDeletion) {
|
|
|
|
NewDB();
|
|
|
|
|
|
|
|
constexpr WalNumber kClosedLogNumber = 10;
|
|
|
|
constexpr WalNumber kNonClosedLogNumber = 20;
|
|
|
|
constexpr uint64_t kSizeInBytes = 111;
|
|
|
|
|
|
|
|
// Add a non-closed and a closed WAL.
|
|
|
|
{
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes));
|
|
|
|
edit.AddWal(kNonClosedLogNumber);
|
|
|
|
|
|
|
|
ASSERT_OK(LogAndApplyToDefaultCF(edit));
|
|
|
|
|
|
|
|
const auto& wals = versions_->GetWalSet().GetWals();
|
|
|
|
ASSERT_EQ(wals.size(), 2);
|
|
|
|
ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
|
|
|
|
ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end());
|
|
|
|
ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
|
|
|
|
ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize());
|
|
|
|
ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete the closed WAL.
|
|
|
|
{
|
|
|
|
VersionEdit edit;
|
2020-11-07 00:30:44 +00:00
|
|
|
edit.DeleteWalsBefore(kNonClosedLogNumber);
|
2020-10-24 05:48:00 +00:00
|
|
|
|
|
|
|
ASSERT_OK(LogAndApplyToDefaultCF(edit));
|
|
|
|
|
|
|
|
const auto& wals = versions_->GetWalSet().GetWals();
|
|
|
|
ASSERT_EQ(wals.size(), 1);
|
|
|
|
ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
|
|
|
|
ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Recover a new VersionSet, only the non-closed WAL should show up.
|
|
|
|
{
|
2023-10-27 22:56:48 +00:00
|
|
|
std::unique_ptr<VersionSet> new_versions(new VersionSet(
|
|
|
|
dbname_, &db_options_, env_options_, table_cache_.get(),
|
|
|
|
&write_buffer_manager_, &write_controller_,
|
|
|
|
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
|
2023-11-11 16:11:11 +00:00
|
|
|
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
|
2023-12-29 02:25:29 +00:00
|
|
|
/*error_handler=*/nullptr, /*read_only=*/false));
|
2020-10-24 05:48:00 +00:00
|
|
|
ASSERT_OK(new_versions->Recover(column_families_, false));
|
|
|
|
const auto& wals = new_versions->GetWalSet().GetWals();
|
|
|
|
ASSERT_EQ(wals.size(), 1);
|
|
|
|
ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
|
|
|
|
ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Force the creation of a new MANIFEST file,
|
|
|
|
// only the non-closed WAL should be written to the new MANIFEST.
|
|
|
|
{
|
|
|
|
std::vector<WalAddition> wal_additions;
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) {
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
VersionEdit* edit = static_cast<VersionEdit*>(arg);
|
2020-10-24 05:48:00 +00:00
|
|
|
ASSERT_TRUE(edit->IsWalAddition());
|
|
|
|
for (auto& addition : edit->GetWalAdditions()) {
|
|
|
|
wal_additions.push_back(addition);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
|
|
|
|
CreateNewManifest();
|
|
|
|
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
|
|
|
|
ASSERT_EQ(wal_additions.size(), 1);
|
|
|
|
ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber);
|
|
|
|
ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Recover from the new MANIFEST, only the non-closed WAL should show up.
|
|
|
|
{
|
2023-10-27 22:56:48 +00:00
|
|
|
std::unique_ptr<VersionSet> new_versions(new VersionSet(
|
|
|
|
dbname_, &db_options_, env_options_, table_cache_.get(),
|
|
|
|
&write_buffer_manager_, &write_controller_,
|
|
|
|
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
|
2023-11-11 16:11:11 +00:00
|
|
|
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
|
2023-12-29 02:25:29 +00:00
|
|
|
/*error_handler=*/nullptr, /*read_only=*/false));
|
2020-10-24 05:48:00 +00:00
|
|
|
ASSERT_OK(new_versions->Recover(column_families_, false));
|
|
|
|
const auto& wals = new_versions->GetWalSet().GetWals();
|
|
|
|
ASSERT_EQ(wals.size(), 1);
|
|
|
|
ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
|
|
|
|
ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Applying the same WAL-creation edit twice must fail the second time with a
// Corruption status naming the duplicated WAL.
TEST_F(VersionSetTest, WalCreateTwice) {
  NewDB();

  constexpr WalNumber kLogNumber = 10;

  VersionEdit creation;
  creation.AddWal(kLogNumber);

  // First application succeeds.
  ASSERT_OK(LogAndApplyToDefaultCF(creation));

  // Second application of the very same edit is rejected.
  Status repeat_status = LogAndApplyToDefaultCF(creation);
  ASSERT_TRUE(repeat_status.IsCorruption());
  const std::string message = repeat_status.ToString();
  ASSERT_TRUE(message.find("WAL 10 is created more than once") !=
              std::string::npos)
      << message;
}
|
|
|
|
|
|
|
|
// Once a WAL has been created and closed, creating it again must fail with a
// Corruption status naming the duplicated WAL.
TEST_F(VersionSetTest, WalCreateAfterClose) {
  NewDB();

  constexpr WalNumber kLogNumber = 10;
  constexpr uint64_t kSizeInBytes = 111;

  // Add a closed WAL: creation followed by the size record, in one edit.
  {
    VersionEdit create_and_close;
    create_and_close.AddWal(kLogNumber);
    create_and_close.AddWal(kLogNumber, WalMetadata(kSizeInBytes));
    ASSERT_OK(LogAndApplyToDefaultCF(create_and_close));
  }

  // Create the same WAL again.
  {
    VersionEdit recreate;
    recreate.AddWal(kLogNumber);

    Status recreate_status = LogAndApplyToDefaultCF(recreate);
    ASSERT_TRUE(recreate_status.IsCorruption());
    const std::string message = recreate_status.ToString();
    ASSERT_TRUE(message.find("WAL 10 is created more than once") !=
                std::string::npos)
        << message;
  }
}
|
|
|
|
|
|
|
|
// Re-adding an already tracked WAL with a smaller synced size must return OK
// and leave the tracked WAL set exactly as it was.
TEST_F(VersionSetTest, AddWalWithSmallerSize) {
  NewDB();
  assert(versions_);

  constexpr WalNumber kLogNumber = 10;
  constexpr uint64_t kSizeInBytes = 111;

  // Add a closed WAL.
  {
    VersionEdit initial;
    initial.AddWal(kLogNumber, WalMetadata(kSizeInBytes));
    ASSERT_OK(LogAndApplyToDefaultCF(initial));
  }
  // Snapshot (by copy) for the comparison at the end.
  const std::map<WalNumber, WalMetadata> before =
      versions_->GetWalSet().GetWals();

  // Add the same WAL with smaller synced size.
  {
    VersionEdit smaller;
    smaller.AddWal(kLogNumber, WalMetadata(kSizeInBytes / 2));
    ASSERT_OK(LogAndApplyToDefaultCF(smaller));
  }
  const std::map<WalNumber, WalMetadata> after =
      versions_->GetWalSet().GetWals();

  ASSERT_EQ(before, after);
}
|
|
|
|
|
2020-11-07 00:30:44 +00:00
|
|
|
// DeleteWalsBefore() is given a number that lies between two tracked WALs
// and matches neither. After Recover(), only the WAL above that number is
// left.
TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {
  NewDB();

  constexpr WalNumber kLogNumber0 = 10;
  constexpr WalNumber kLogNumber1 = 20;
  constexpr WalNumber kNonExistingNumber = 15;
  constexpr uint64_t kSizeInBytes = 111;

  // Add closed WALs.
  {
    const WalMetadata closed(kSizeInBytes);
    VersionEdit additions;
    additions.AddWal(kLogNumber0, closed);
    additions.AddWal(kLogNumber1, closed);
    ASSERT_OK(LogAndApplyToDefaultCF(additions));
  }

  // Delete WALs before a non-existing WAL.
  {
    VersionEdit deletion;
    deletion.DeleteWalsBefore(kNonExistingNumber);
    ASSERT_OK(LogAndApplyToDefaultCF(deletion));
  }

  // Recover a new VersionSet, WAL0 is deleted, WAL1 is not.
  {
    std::unique_ptr<VersionSet> recovered(new VersionSet(
        dbname_, &db_options_, env_options_, table_cache_.get(),
        &write_buffer_manager_, &write_controller_,
        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
        /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
        /*error_handler=*/nullptr, /*read_only=*/false));
    ASSERT_OK(recovered->Recover(column_families_, /*read_only=*/false));

    const auto& tracked = recovered->GetWalSet().GetWals();
    ASSERT_EQ(tracked.size(), 1);
    ASSERT_TRUE(tracked.find(kLogNumber1) != tracked.end());
  }
}
|
|
|
|
|
|
|
|
// DeleteWalsBefore() is given a number above every tracked WAL. After
// Recover(), the WAL set is empty.
TEST_F(VersionSetTest, DeleteAllWals) {
  NewDB();

  constexpr WalNumber kMaxLogNumber = 10;
  constexpr uint64_t kSizeInBytes = 111;

  // Add a closed WAL.
  {
    VersionEdit addition;
    addition.AddWal(kMaxLogNumber, WalMetadata(kSizeInBytes));
    ASSERT_OK(LogAndApplyToDefaultCF(addition));
  }

  // Delete everything below a number past the largest WAL.
  {
    VersionEdit deletion;
    deletion.DeleteWalsBefore(kMaxLogNumber + 10);
    ASSERT_OK(LogAndApplyToDefaultCF(deletion));
  }

  // Recover a new VersionSet, all WALs are deleted.
  {
    std::unique_ptr<VersionSet> recovered(new VersionSet(
        dbname_, &db_options_, env_options_, table_cache_.get(),
        &write_buffer_manager_, &write_controller_,
        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
        /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
        /*error_handler=*/nullptr, /*read_only=*/false));
    ASSERT_OK(recovered->Recover(column_families_, /*read_only=*/false));

    const auto& tracked = recovered->GetWalSet().GetWals();
    ASSERT_TRUE(tracked.empty());
  }
}
|
|
|
|
|
|
|
|
// Applies one atomic group that mixes WAL additions, a DB ID edit and a WAL
// deletion, then checks on a second VersionSet after Recover() that the DB
// ID and the single surviving WAL are reported.
TEST_F(VersionSetTest, AtomicGroupWithWalEdits) {
  NewDB();

  constexpr uint64_t kNumWals = 5;
  // One edit per added WAL, plus the DB ID edit and the WAL deletion edit.
  constexpr int kAtomicGroupSize = static_cast<int>(kNumWals) + 2;
  const std::string kDBId = "db_db";

  // Each edit is marked with the number of edits still to come in the group.
  int remaining = kAtomicGroupSize;
  autovector<std::unique_ptr<VersionEdit>> edits;
  // Add kNumWals WALs.
  for (uint64_t i = 1; i <= kNumWals; i++) {
    // make_unique keeps the edit owned at all times, so nothing leaks if the
    // container has to grow and that allocation fails.
    edits.emplace_back(std::make_unique<VersionEdit>());
    // WAL's size equals its log number.
    edits.back()->AddWal(i, WalMetadata(i));
    edits.back()->MarkAtomicGroup(--remaining);
  }
  // One edit with the DB ID set.
  edits.emplace_back(std::make_unique<VersionEdit>());
  edits.back()->SetDBId(kDBId);
  edits.back()->MarkAtomicGroup(--remaining);
  // Delete every added WAL except the last one.
  edits.emplace_back(std::make_unique<VersionEdit>());
  edits.back()->DeleteWalsBefore(kNumWals);
  edits.back()->MarkAtomicGroup(--remaining);
  // The group must be exactly full.
  ASSERT_EQ(remaining, 0);

  ASSERT_OK(LogAndApplyToDefaultCF(edits));

  // Recover a new VersionSet, the DB ID and the last WAL should be kept.
  {
    std::unique_ptr<VersionSet> new_versions(new VersionSet(
        dbname_, &db_options_, env_options_, table_cache_.get(),
        &write_buffer_manager_, &write_controller_,
        /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
        /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
        /*error_handler=*/nullptr, /*read_only=*/false));
    std::string db_id;
    ASSERT_OK(
        new_versions->Recover(column_families_, /*read_only=*/false, &db_id));

    ASSERT_EQ(db_id, kDBId);

    const auto& wals = new_versions->GetWalSet().GetWals();
    ASSERT_EQ(wals.size(), 1);
    ASSERT_TRUE(wals.find(kNumWals) != wals.end());
    ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize());
    ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals);
  }
}
|
|
|
|
|
2023-10-27 22:56:48 +00:00
|
|
|
TEST_F(VersionSetTest, OffpeakTimeInfoTest) {
  Random rnd(test::RandomSeed());

  // The off-peak window runs from 11:30PM to 4:30AM of the following day.
  // The mock clock starts at 1:30PM and is moved forward with mock sleeps;
  // after each step the off-peak status reported for the new time is checked.
  int start_hour = 13;
  int start_minute = 30;
  versions_->ChangeOffpeakTimeOption("23:30-04:30");

  auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
  // Shift the starting point by a random number of whole days.
  int extra_days = rnd.Uniform(100);
  mock_clock->SetCurrentTime(extra_days * 86400 + start_hour * 3600 +
                             start_minute * 60);
  int64_t now;
  ASSERT_OK(mock_clock->GetCurrentTime(&now));

  // Off-peak status for whatever `now` currently holds.
  const auto is_offpeak = [&]() {
    return versions_->offpeak_time_option()
        .GetOffpeakTimeInfo(now)
        .is_now_offpeak;
  };

  // 1:30PM: outside the window.
  ASSERT_FALSE(is_offpeak());

  // +3h -> 4:30PM: still outside the window.
  mock_clock->MockSleepForSeconds(3 * 3600);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_FALSE(is_offpeak());

  // +7h -> 11:30PM: the window has just started.
  mock_clock->MockSleepForSeconds(7 * 3600);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());

  // +3h -> 2:30AM next day: still inside the window.
  mock_clock->MockSleepForSeconds(3 * 3600);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());

  // +2h -> 4:30AM: the end of the window is still off-peak.
  mock_clock->MockSleepForSeconds(2 * 3600);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());

  // +1min -> 4:31AM: the window is over.
  mock_clock->MockSleepForSeconds(60);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_FALSE(is_offpeak());

  // Switch to a window that covers the entire day.
  versions_->ChangeOffpeakTimeOption("00:00-23:59");
  // Whatever the time is, it must now be reported as off-peak.
  ASSERT_TRUE(is_offpeak());

  // +3h: still off-peak.
  mock_clock->MockSleepForSeconds(3 * 3600);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());

  // +20h: still off-peak.
  mock_clock->MockSleepForSeconds(20 * 3600);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());

  // +59min: still off-peak.
  mock_clock->MockSleepForSeconds(59 * 60);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());

  // +59s: still off-peak.
  mock_clock->MockSleepForSeconds(59);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());

  // +1s: exactly 24h since the window was changed. Still off-peak.
  mock_clock->MockSleepForSeconds(1);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());

  // One more second as a sanity check.
  mock_clock->MockSleepForSeconds(1);
  ASSERT_OK(mock_clock->GetCurrentTime(&now));
  ASSERT_TRUE(is_offpeak());
}
|
|
|
|
|
2023-12-29 02:25:29 +00:00
|
|
|
// Truncates the MANIFEST to zero bytes from the
// "VersionSet::Close:AfterClose" sync point while the DB is being closed, and
// then reopens the DB.
TEST_F(VersionSetTest, ManifestTruncateAfterClose) {
  std::string manifest_path;
  VersionEdit edit;

  NewDB();
  ASSERT_OK(LogAndApplyToDefaultCF(edit));
  SyncPoint::GetInstance()->SetCallBack(
      "VersionSet::Close:AfterClose", [&](void*) {
        GetManifestPath(&manifest_path);
        std::unique_ptr<WritableFile> manifest_file;
        // Fatal assertion: on failure it returns from this callback, so the
        // (then possibly null) file handle is never dereferenced below.
        ASSERT_OK(env_->ReopenWritableFile(manifest_path, &manifest_file,
                                           EnvOptions()));
        EXPECT_OK(manifest_file->Truncate(0));
        EXPECT_OK(manifest_file->Close());
      });
  SyncPoint::GetInstance()->EnableProcessing();
  CloseDB();
  SyncPoint::GetInstance()->DisableProcessing();
  // The callback captures this test's locals by reference; unregister it so
  // it cannot outlive them.
  SyncPoint::GetInstance()->ClearAllCallBacks();

  ReopenDB();
}
|
|
|
|
|
2022-12-29 21:28:24 +00:00
|
|
|
TEST_F(VersionStorageInfoTest, AddRangeDeletionCompensatedFileSize) {
  // Tests that compensated range deletion size is added to compensated file
  // size.
  // NOTE(review): presumably the arguments are level 4, file number 100, key
  // range ["1", "2"], file size 100 and compensated range deletion size 1000
  // -- confirm against the fixture's Add() helper.
  Add(4, 100U, "1", "2", 100U, kInvalidBlobFileNumber, 1000U);

  UpdateVersionStorageInfo();

  auto meta = vstorage_.GetFileMetaDataByNumber(100U);
  // Fail the test cleanly instead of dereferencing a null pointer if the file
  // was not found.
  ASSERT_NE(nullptr, meta);
  ASSERT_EQ(meta->compensated_file_size, 100U + 1000U);
}
|
|
|
|
|
2020-12-05 22:17:11 +00:00
|
|
|
class VersionSetWithTimestampTest : public VersionSetTest {
|
|
|
|
public:
|
|
|
|
static const std::string kNewCfName;
|
|
|
|
|
|
|
|
explicit VersionSetWithTimestampTest() : VersionSetTest() {}
|
|
|
|
|
|
|
|
void SetUp() override {
|
|
|
|
NewDB();
|
|
|
|
Options options;
|
2022-02-08 20:14:25 +00:00
|
|
|
options.comparator = test::BytewiseComparatorWithU64TsWrapper();
|
2020-12-05 22:17:11 +00:00
|
|
|
cfd_ = CreateColumnFamily(kNewCfName, options);
|
|
|
|
EXPECT_NE(nullptr, cfd_);
|
|
|
|
EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions());
|
|
|
|
column_families_.emplace_back(kNewCfName, options);
|
|
|
|
}
|
|
|
|
|
|
|
|
void TearDown() override {
|
|
|
|
for (auto* e : edits_) {
|
|
|
|
delete e;
|
|
|
|
}
|
|
|
|
edits_.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
void GenVersionEditsToSetFullHistoryTsLow(
|
|
|
|
const std::vector<uint64_t>& ts_lbs) {
|
|
|
|
for (const auto ts_lb : ts_lbs) {
|
|
|
|
VersionEdit* edit = new VersionEdit;
|
|
|
|
edit->SetColumnFamily(cfd_->GetID());
|
|
|
|
std::string ts_str = test::EncodeInt(ts_lb);
|
|
|
|
edit->SetFullHistoryTsLow(ts_str);
|
|
|
|
edits_.emplace_back(edit);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void VerifyFullHistoryTsLow(uint64_t expected_ts_low) {
|
2023-10-27 22:56:48 +00:00
|
|
|
std::unique_ptr<VersionSet> vset(new VersionSet(
|
|
|
|
dbname_, &db_options_, env_options_, table_cache_.get(),
|
|
|
|
&write_buffer_manager_, &write_controller_,
|
|
|
|
/*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
|
2023-11-11 16:11:11 +00:00
|
|
|
/*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"",
|
2023-12-29 02:25:29 +00:00
|
|
|
/*error_handler=*/nullptr, /*read_only=*/false));
|
2020-12-05 22:17:11 +00:00
|
|
|
ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false,
|
|
|
|
/*db_id=*/nullptr));
|
|
|
|
for (auto* cfd : *(vset->GetColumnFamilySet())) {
|
|
|
|
ASSERT_NE(nullptr, cfd);
|
|
|
|
if (cfd->GetName() == kNewCfName) {
|
|
|
|
ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow());
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DoTest(const std::vector<uint64_t>& ts_lbs) {
|
|
|
|
if (ts_lbs.empty()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
GenVersionEditsToSetFullHistoryTsLow(ts_lbs);
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
mutex_.Lock();
|
|
|
|
s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()),
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
read_options_, write_options_, edits_, &mutex_,
|
|
|
|
nullptr);
|
2020-12-05 22:17:11 +00:00
|
|
|
mutex_.Unlock();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end()));
|
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
ColumnFamilyData* cfd_{nullptr};
|
|
|
|
// edits_ must contain and own pointers to heap-alloc VersionEdit objects.
|
|
|
|
autovector<VersionEdit*> edits_;
|
2023-04-21 16:07:18 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
const ReadOptions read_options_;
|
2020-12-05 22:17:11 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Name of the additional column family that the fixture's SetUp() creates.
const std::string VersionSetWithTimestampTest::kNewCfName("new_cf");
|
|
|
|
|
|
|
|
// Sets full_history_ts_low a single time.
TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) {
  const std::vector<uint64_t> ts_lbs = {100};
  DoTest(ts_lbs);
}
|
|
|
|
|
|
|
|
// Simulate the application increasing full_history_ts_low.
|
|
|
|
TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) {
|
|
|
|
const std::vector<uint64_t> ts_lbs = {100, 101, 102, 103};
|
|
|
|
DoTest(ts_lbs);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Simulate the application trying to decrease full_history_ts_low
|
|
|
|
// unsuccessfully. If the application calls public API sequentially to
|
|
|
|
// decrease the lower bound ts, RocksDB will return an InvalidArgument
|
|
|
|
// status before involving VersionSet. Only when multiple threads trying
|
|
|
|
// to decrease the lower bound concurrently will this case ever happen. Even
|
|
|
|
// so, the lower bound cannot be decreased. The application will be notified
|
|
|
|
// via return value of the API.
|
|
|
|
TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) {
|
|
|
|
const std::vector<uint64_t> ts_lbs = {103, 102, 101, 100};
|
|
|
|
DoTest(ts_lbs);
|
|
|
|
}
|
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
class VersionSetAtomicGroupTest : public VersionSetTestBase,
|
|
|
|
public testing::Test {
|
|
|
|
public:
|
2020-03-21 02:17:54 +00:00
|
|
|
VersionSetAtomicGroupTest()
|
|
|
|
: VersionSetTestBase("version_set_atomic_group_test") {}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
explicit VersionSetAtomicGroupTest(const std::string& name)
|
|
|
|
: VersionSetTestBase(name) {}
|
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Per-test setup: prepares the manifest (filling in the column families, the
// last sequence number and the log writer) and then installs the sync point
// callbacks used by the atomic group tests.
void SetUp() override {
  PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
  SetupTestSyncPoints();
}
|
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
void SetupValidAtomicGroup(int atomic_group_size) {
|
|
|
|
edits_.resize(atomic_group_size);
|
|
|
|
int remaining = atomic_group_size;
|
|
|
|
for (size_t i = 0; i != edits_.size(); ++i) {
|
|
|
|
edits_[i].SetLogNumber(0);
|
|
|
|
edits_[i].SetNextFile(2);
|
|
|
|
edits_[i].MarkAtomicGroup(--remaining);
|
|
|
|
edits_[i].SetLastSequence(last_seqno_++);
|
|
|
|
}
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2019-06-04 17:51:22 +00:00
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
|
|
|
|
edits_.resize(atomic_group_size);
|
|
|
|
int remaining = atomic_group_size;
|
|
|
|
for (size_t i = 0; i != edits_.size(); ++i) {
|
|
|
|
edits_[i].SetLogNumber(0);
|
|
|
|
edits_[i].SetNextFile(2);
|
|
|
|
edits_[i].MarkAtomicGroup(--remaining);
|
|
|
|
edits_[i].SetLastSequence(last_seqno_++);
|
|
|
|
}
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2019-06-04 17:51:22 +00:00
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
void SetupCorruptedAtomicGroup(int atomic_group_size) {
|
|
|
|
edits_.resize(atomic_group_size);
|
|
|
|
int remaining = atomic_group_size;
|
|
|
|
for (size_t i = 0; i != edits_.size(); ++i) {
|
|
|
|
edits_[i].SetLogNumber(0);
|
|
|
|
edits_[i].SetNextFile(2);
|
|
|
|
if (i != ((size_t)atomic_group_size / 2)) {
|
|
|
|
edits_[i].MarkAtomicGroup(--remaining);
|
|
|
|
}
|
|
|
|
edits_[i].SetLastSequence(last_seqno_++);
|
|
|
|
}
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2019-06-04 17:51:22 +00:00
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
void SetupIncorrectAtomicGroup(int atomic_group_size) {
|
|
|
|
edits_.resize(atomic_group_size);
|
|
|
|
int remaining = atomic_group_size;
|
|
|
|
for (size_t i = 0; i != edits_.size(); ++i) {
|
|
|
|
edits_[i].SetLogNumber(0);
|
|
|
|
edits_[i].SetNextFile(2);
|
|
|
|
if (i != 1) {
|
|
|
|
edits_[i].MarkAtomicGroup(--remaining);
|
|
|
|
} else {
|
|
|
|
edits_[i].MarkAtomicGroup(remaining--);
|
|
|
|
}
|
|
|
|
edits_[i].SetLastSequence(last_seqno_++);
|
|
|
|
}
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2019-06-04 17:51:22 +00:00
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
void SetupTestSyncPoints() {
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) {
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
VersionEdit* e = static_cast<VersionEdit*>(arg);
|
2019-06-04 17:51:22 +00:00
|
|
|
EXPECT_EQ(edits_.front().DebugString(),
|
|
|
|
e->DebugString()); // compare based on value
|
|
|
|
first_in_atomic_group_ = true;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) {
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
VersionEdit* e = static_cast<VersionEdit*>(arg);
|
2019-06-04 17:51:22 +00:00
|
|
|
EXPECT_EQ(edits_.back().DebugString(),
|
|
|
|
e->DebugString()); // compare based on value
|
|
|
|
EXPECT_TRUE(first_in_atomic_group_);
|
|
|
|
last_in_atomic_group_ = true;
|
|
|
|
});
|
2020-11-11 15:58:15 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
"VersionEditHandlerBase::Iterate:Finish",
|
|
|
|
[&](void* arg) { num_recovered_edits_ = *static_cast<size_t*>(arg); });
|
2019-06-04 17:51:22 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"AtomicGroupReadBuffer::AddEdit:AtomicGroup",
|
|
|
|
[&](void* /* arg */) { ++num_edits_in_atomic_group_; });
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits",
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
[&](void* arg) { corrupted_edit_ = *static_cast<VersionEdit*>(arg); });
|
2019-06-04 17:51:22 +00:00
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize",
|
|
|
|
[&](void* arg) {
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
edit_with_incorrect_group_size_ = *static_cast<VersionEdit*>(arg);
|
2019-06-04 17:51:22 +00:00
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
void AddNewEditsToLog(int num_edits) {
|
|
|
|
for (int i = 0; i < num_edits; i++) {
|
|
|
|
std::string record;
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
edits_[i].EncodeTo(&record, 0 /* ts_sz */);
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record));
|
2019-06-04 17:51:22 +00:00
|
|
|
}
|
2018-10-30 23:35:58 +00:00
|
|
|
}
|
2019-06-04 17:51:22 +00:00
|
|
|
|
|
|
|
void TearDown() override {
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
log_writer_.reset();
|
2018-10-30 23:35:58 +00:00
|
|
|
}
|
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
protected:
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families_;
|
|
|
|
SequenceNumber last_seqno_;
|
|
|
|
std::vector<VersionEdit> edits_;
|
|
|
|
bool first_in_atomic_group_ = false;
|
|
|
|
bool last_in_atomic_group_ = false;
|
|
|
|
int num_edits_in_atomic_group_ = 0;
|
2021-10-22 17:12:09 +00:00
|
|
|
size_t num_recovered_edits_ = 0;
|
2019-06-04 17:51:22 +00:00
|
|
|
VersionEdit corrupted_edit_;
|
|
|
|
VersionEdit edit_with_incorrect_group_size_;
|
|
|
|
std::unique_ptr<log::Writer> log_writer_;
|
|
|
|
};
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Logs a full atomic group of three edits, then runs VersionSet::Recover.
// Recovery must succeed, keep the expected number of column families, observe
// both the first and last edit of the group, and report the initial edits
// plus the whole group as recovered.
TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) {
  const int kAtomicGroupSize = 3;

  // Arrange: build the group and persist every edit in it.
  SetupValidAtomicGroup(kAtomicGroupSize);
  AddNewEditsToLog(kAtomicGroupSize);

  // Act.
  EXPECT_OK(versions_->Recover(column_families_, false));

  // Assert.
  EXPECT_EQ(column_families_.size(),
            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
  EXPECT_TRUE(first_in_atomic_group_);
  EXPECT_TRUE(last_in_atomic_group_);
  EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Logs a full atomic group of three edits, then recovers through the reactive
// version set. Besides the usual success checks, the reactive version set must
// finish with no partially read atomic group and an empty replay buffer.
TEST_F(VersionSetAtomicGroupTest,
       HandleValidAtomicGroupWithReactiveVersionSetRecover) {
  const int kAtomicGroupSize = 3;

  // Output handles filled in by ReactiveVersionSet::Recover.
  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
  std::unique_ptr<Status> manifest_reader_status;

  SetupValidAtomicGroup(kAtomicGroupSize);
  AddNewEditsToLog(kAtomicGroupSize);

  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
                                        &manifest_reporter,
                                        &manifest_reader_status));

  EXPECT_EQ(column_families_.size(),
            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
  EXPECT_TRUE(first_in_atomic_group_);
  EXPECT_TRUE(last_in_atomic_group_);

  // Nothing may be left buffered once the complete group has been recovered.
  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);

  EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Recovers the reactive version set before any atomic group edit is logged,
// then logs a full group of three edits and feeds it in through
// ReadAndApply. The whole group must be applied, leaving nothing buffered.
TEST_F(VersionSetAtomicGroupTest,
       HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) {
  const int kAtomicGroupSize = 3;
  SetupValidAtomicGroup(kAtomicGroupSize);

  // Phase 1: recover from the log as it stands before the group is written.
  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
  std::unique_ptr<Status> manifest_reader_status;
  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
                                        &manifest_reporter,
                                        &manifest_reader_status));
  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);

  // Phase 2: append the atomic group and tail the log under the mutex.
  AddNewEditsToLog(kAtomicGroupSize);
  InstrumentedMutex mu;
  std::unordered_set<ColumnFamilyData*> cfds_changed;
  mu.Lock();
  EXPECT_OK(reactive_versions_->ReadAndApply(
      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed,
      /*files_to_delete=*/nullptr));
  mu.Unlock();

  EXPECT_TRUE(first_in_atomic_group_);
  EXPECT_TRUE(last_in_atomic_group_);

  // Applying the complete group must drain the replay buffer.
  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);

  EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_);
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Logs only the first three edits of a four-edit atomic group, then runs
// VersionSet::Recover. Recovery must still succeed; the start of the group is
// seen but not its end, and only the initial edits count as recovered.
TEST_F(VersionSetAtomicGroupTest,
       HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) {
  const int kAtomicGroupSize = 4;
  // One edit short of a full group.
  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;

  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
  AddNewEditsToLog(kNumberOfPersistedVersionEdits);

  EXPECT_OK(versions_->Recover(column_families_, false));

  EXPECT_EQ(column_families_.size(),
            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
  EXPECT_TRUE(first_in_atomic_group_);
  EXPECT_FALSE(last_in_atomic_group_);
  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
}
|
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Logs the first three edits of a four-edit atomic group and recovers through
// the reactive version set, which is expected to hold the partial group in
// its replay buffer. The final edit is then appended and ReadAndApply is
// expected to drain the buffer.
TEST_F(VersionSetAtomicGroupTest,
       HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) {
  const int kAtomicGroupSize = 4;
  // One edit short of a full group.
  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;

  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
  AddNewEditsToLog(kNumberOfPersistedVersionEdits);

  // Phase 1: recover while the group is still incomplete.
  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
  std::unique_ptr<Status> manifest_reader_status;
  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
                                        &manifest_reporter,
                                        &manifest_reader_status));

  EXPECT_EQ(column_families_.size(),
            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
  EXPECT_TRUE(first_in_atomic_group_);
  EXPECT_FALSE(last_in_atomic_group_);
  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);

  // The partial group must be retained in the replay buffer, not applied.
  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
              kNumberOfPersistedVersionEdits);
  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);

  // Phase 2: append the missing final edit so the group becomes complete.
  std::string last_record;
  edits_[kAtomicGroupSize - 1].EncodeTo(&last_record);
  EXPECT_OK(log_writer_->AddRecord(WriteOptions(), last_record));

  InstrumentedMutex mu;
  std::unordered_set<ColumnFamilyData*> cfds_changed;
  mu.Lock();
  EXPECT_OK(reactive_versions_->ReadAndApply(
      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed,
      /*files_to_delete=*/nullptr));
  mu.Unlock();

  // With the group complete, nothing should remain buffered.
  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Recovers the reactive version set before any atomic group edit is logged,
// then logs only the first three edits of a four-edit group and calls
// ReadAndApply. The call must succeed and leave the partial group parked in
// the replay buffer.
TEST_F(VersionSetAtomicGroupTest,
       HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) {
  const int kAtomicGroupSize = 4;
  // One edit short of a full group.
  const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
  SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);

  // Phase 1: recover; no atomic group edits have been written to the log yet.
  std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
  std::unique_ptr<log::Reader::Reporter> manifest_reporter;
  std::unique_ptr<Status> manifest_reader_status;
  EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
                                        &manifest_reporter,
                                        &manifest_reader_status));
  EXPECT_EQ(column_families_.size(),
            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
  EXPECT_EQ(num_initial_edits_, num_recovered_edits_);

  // Phase 2: append an incomplete group and tail the log under the mutex.
  AddNewEditsToLog(kNumberOfPersistedVersionEdits);
  InstrumentedMutex mu;
  std::unordered_set<ColumnFamilyData*> cfds_changed;
  mu.Lock();
  EXPECT_OK(reactive_versions_->ReadAndApply(
      &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed,
      /*files_to_delete=*/nullptr));
  mu.Unlock();

  EXPECT_TRUE(first_in_atomic_group_);
  EXPECT_FALSE(last_in_atomic_group_);
  EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);

  // The partial group must be retained in the replay buffer, not applied.
  EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
              kNumberOfPersistedVersionEdits);
  EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// VersionSet::Recover() is expected to fail when the log contains an atomic
// group prepared by SetupCorruptedAtomicGroup().
TEST_F(VersionSetAtomicGroupTest,
       HandleCorruptedAtomicGroupWithVersionSetRecover) {
  const int kAtomicGroupSize = 4;
  SetupCorruptedAtomicGroup(kAtomicGroupSize);
  AddNewEditsToLog(kAtomicGroupSize);

  const Status recover_status = versions_->Recover(column_families_, false);
  EXPECT_NOK(recover_status);
  EXPECT_EQ(column_families_.size(),
            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());

  // The edit recorded in corrupted_edit_ must match the one in the middle of
  // the group.
  const int corrupted_index = kAtomicGroupSize / 2;
  EXPECT_EQ(edits_[corrupted_index].DebugString(),
            corrupted_edit_.DebugString());
}
|
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// ReactiveVersionSet::Recover() is expected to fail when the log contains an
// atomic group prepared by SetupCorruptedAtomicGroup().
TEST_F(VersionSetAtomicGroupTest,
       HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) {
  const int kAtomicGroupSize = 4;
  SetupCorruptedAtomicGroup(kAtomicGroupSize);
  AddNewEditsToLog(kAtomicGroupSize);

  std::unique_ptr<log::FragmentBufferedReader> reader;
  std::unique_ptr<log::Reader::Reporter> reporter;
  std::unique_ptr<Status> reader_status;
  const Status recover_status = reactive_versions_->Recover(
      column_families_, &reader, &reporter, &reader_status);
  EXPECT_NOK(recover_status);
  EXPECT_EQ(column_families_.size(),
            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());

  // The edit recorded in corrupted_edit_ must match the one in the middle of
  // the group.
  const int corrupted_index = kAtomicGroupSize / 2;
  EXPECT_EQ(edits_[corrupted_index].DebugString(),
            corrupted_edit_.DebugString());
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Recovers a ReactiveVersionSet first, then appends the atomic group prepared
// by SetupCorruptedAtomicGroup() and expects ReadAndApply() to fail on it.
TEST_F(VersionSetAtomicGroupTest,
       HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) {
  const int kAtomicGroupSize = 4;
  SetupCorruptedAtomicGroup(kAtomicGroupSize);

  InstrumentedMutex mutex;
  std::unordered_set<ColumnFamilyData*> changed_cfds;
  std::unique_ptr<log::FragmentBufferedReader> reader;
  std::unique_ptr<log::Reader::Reporter> reporter;
  std::unique_ptr<Status> reader_status;

  const Status recover_status = reactive_versions_->Recover(
      column_families_, &reader, &reporter, &reader_status);
  EXPECT_OK(recover_status);

  // Append the corrupted edits only after the initial recovery.
  AddNewEditsToLog(kAtomicGroupSize);

  mutex.Lock();
  const Status apply_status = reactive_versions_->ReadAndApply(
      &mutex, &reader, reader_status.get(), &changed_cfds,
      /*files_to_delete=*/nullptr);
  mutex.Unlock();
  EXPECT_NOK(apply_status);

  const int corrupted_index = kAtomicGroupSize / 2;
  EXPECT_EQ(edits_[corrupted_index].DebugString(),
            corrupted_edit_.DebugString());
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// VersionSet::Recover() is expected to fail when the log contains an atomic
// group prepared by SetupIncorrectAtomicGroup().
TEST_F(VersionSetAtomicGroupTest,
       HandleIncorrectAtomicGroupSizeWithVersionSetRecover) {
  const int kAtomicGroupSize = 4;
  SetupIncorrectAtomicGroup(kAtomicGroupSize);
  AddNewEditsToLog(kAtomicGroupSize);

  const Status recover_status = versions_->Recover(column_families_, false);
  EXPECT_NOK(recover_status);
  EXPECT_EQ(column_families_.size(),
            versions_->GetColumnFamilySet()->NumberOfColumnFamilies());

  // The edit recorded as having the wrong group size must be the second edit.
  const auto& second_edit = edits_[1];
  EXPECT_EQ(second_edit.DebugString(),
            edit_with_incorrect_group_size_.DebugString());
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// ReactiveVersionSet::Recover() is expected to fail when the log contains an
// atomic group prepared by SetupIncorrectAtomicGroup().
TEST_F(VersionSetAtomicGroupTest,
       HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) {
  const int kAtomicGroupSize = 4;
  SetupIncorrectAtomicGroup(kAtomicGroupSize);
  AddNewEditsToLog(kAtomicGroupSize);

  std::unique_ptr<log::FragmentBufferedReader> reader;
  std::unique_ptr<log::Reader::Reporter> reporter;
  std::unique_ptr<Status> reader_status;
  const Status recover_status = reactive_versions_->Recover(
      column_families_, &reader, &reporter, &reader_status);
  EXPECT_NOK(recover_status);
  EXPECT_EQ(column_families_.size(),
            reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());

  // The edit recorded as having the wrong group size must be the second edit.
  const auto& second_edit = edits_[1];
  EXPECT_EQ(second_edit.DebugString(),
            edit_with_incorrect_group_size_.DebugString());
}
|
2018-10-30 23:35:58 +00:00
|
|
|
|
2019-06-04 17:51:22 +00:00
|
|
|
// Recovers a ReactiveVersionSet first, then appends the atomic group prepared
// by SetupIncorrectAtomicGroup() and expects ReadAndApply() to fail on it.
TEST_F(VersionSetAtomicGroupTest,
       HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) {
  const int kAtomicGroupSize = 4;
  SetupIncorrectAtomicGroup(kAtomicGroupSize);

  InstrumentedMutex mutex;
  std::unordered_set<ColumnFamilyData*> changed_cfds;
  std::unique_ptr<log::FragmentBufferedReader> reader;
  std::unique_ptr<log::Reader::Reporter> reporter;
  std::unique_ptr<Status> reader_status;

  const Status recover_status = reactive_versions_->Recover(
      column_families_, &reader, &reporter, &reader_status);
  EXPECT_OK(recover_status);

  // Append the malformed group only after the initial recovery.
  AddNewEditsToLog(kAtomicGroupSize);

  mutex.Lock();
  const Status apply_status = reactive_versions_->ReadAndApply(
      &mutex, &reader, reader_status.get(), &changed_cfds,
      /*files_to_delete=*/nullptr);
  mutex.Unlock();
  EXPECT_NOK(apply_status);

  const auto& second_edit = edits_[1];
  EXPECT_EQ(second_edit.DebugString(),
            edit_with_incorrect_group_size_.DebugString());
}
|
2018-12-13 23:10:16 +00:00
|
|
|
|
Best-effort recovery support for atomic flush (#12406)
Summary:
This PR updates `VersionEditHandlerPointInTime` to recover all or none of the updates in an AtomicGroup. This makes best-effort recovery properly handle atomic flushes during recovery, so the features are now allowed to both be enabled at once.
The new logic requires that AtomicGroups do not contain column family additions or removals. AtomicGroups are currently written for atomic flush, which does not include such edits.
Column family additions or removals are recovered independently of AtomicGroups. The new logic needs to be aware of removal, though, so that a dropped CF does not prevent completion of an AtomicGroup recovery.
The new logic treats each AtomicGroup as if it contains updates for all existing column families, even though it is possible to create AtomicGroups that only affect a subset of column families. This simplifies the logic at the expense of recovering less data in certain edge case scenarios.
The usage of `MaybeCreateVersion()` is pretty tricky. The goal is to create a barrier at the start of an AtomicGroup such that all valid states up to that point will be applied to `versions_`. Here is a summary.
- `MaybeCreateVersion(..., false)` creates a `Version` on a negative edge trigger (transition from valid to invalid). It was previously called when applying each update. Now, it is only called when applying non-AtomicGroup updates.
- `MaybeCreateVersion(..., true)` creates a `Version` on a positive level trigger (valid state). It was previously called only at the end of iteration. Now, it is additionally called before processing an AtomicGroup.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12406
Reviewed By: jaykorean, cbi42
Differential Revision: D54494904
Pulled By: ajkr
fbshipit-source-id: 0114a9fe1d04b471d086dcab5978ea8a3a56ad52
2024-03-06 22:40:40 +00:00
|
|
|
class AtomicGroupBestEffortRecoveryTest : public VersionSetAtomicGroupTest {
|
|
|
|
public:
|
|
|
|
AtomicGroupBestEffortRecoveryTest()
|
|
|
|
: VersionSetAtomicGroupTest("atomic_group_best_effort_recovery_test") {}
|
|
|
|
};
|
|
|
|
|
|
|
|
// Best-effort recovery of one AtomicGroup whose edits each add a table file
// created by CreateDummyTableFiles(). Recovery must report no missing table
// file and every added file must be live afterwards.
TEST_F(AtomicGroupBestEffortRecoveryTest,
       HandleAtomicGroupUpdatesValidInitially) {
  // One AtomicGroup contains updates that are valid at the outset.
  std::vector<SstInfo> file_infos;
  for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) {
    int file_number = 10 + cfid;
    file_infos.emplace_back(file_number, column_families_[cfid].name,
                            "" /* key */, 0 /* level */,
                            file_number /* epoch_number */);
  }

  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(file_infos, &file_metas);

  // One edit per column family, all bundled into a single AtomicGroup.
  edits_.clear();
  for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) {
    edits_.emplace_back();
    edits_.back().SetColumnFamily(cfid);
    edits_.back().AddFile(0 /* level */, file_metas[cfid]);
    edits_.back().SetLastSequence(++last_seqno_);
    edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 -
                                  cfid /* remaining_entries */);
  }
  AddNewEditsToLog(kNumColumnFamilies);

  {
    // Start from a known value so the assertion below never reads an
    // indeterminate bool if TryRecover() leaves the flag untouched.
    bool has_missing_table_file = false;
    ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */,
                                    {DescriptorFileName(1 /* number */)},
                                    nullptr /* db_id */,
                                    &has_missing_table_file));
    ASSERT_FALSE(has_missing_table_file);
  }
  std::vector<uint64_t> all_table_files;
  std::vector<uint64_t> all_blob_files;
  versions_->AddLiveFiles(&all_table_files, &all_blob_files);
  ASSERT_EQ(file_metas.size(), all_table_files.size());
}
|
|
|
|
|
|
|
|
// Best-effort recovery of an AtomicGroup whose last edit refers to file
// number 20. A later standalone edit deletes that file reference, after which
// the group is expected to be recoverable. A trailing single-edit AtomicGroup
// that refers to file number 30 is appended as well.
TEST_F(AtomicGroupBestEffortRecoveryTest, HandleAtomicGroupUpdatesValidLater) {
  // One AtomicGroup contains updates that become valid after applying further
  // updates.

  // `SetupTestSyncPoints()` creates sync points that assume there is only one
  // AtomicGroup, which is not the case in this test.
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();

  std::vector<SstInfo> file_infos;
  for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) {
    int file_number = 10 + cfid;
    file_infos.emplace_back(file_number, column_families_[cfid].name,
                            "" /* key */, 0 /* level */,
                            file_number /* epoch_number */);
  }

  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(file_infos, &file_metas);

  edits_.clear();
  for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) {
    if (cfid == kNumColumnFamilies - 1) {
      // Corrupt the number of the last file.
      file_metas[cfid].fd.packed_number_and_path_id =
          PackFileNumberAndPathId(20 /* number */, 0 /* path_id */);
    }
    edits_.emplace_back();
    edits_.back().SetColumnFamily(cfid);
    edits_.back().AddFile(0 /* level */, file_metas[cfid]);
    edits_.back().SetLastSequence(++last_seqno_);
    edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 -
                                  cfid /* remaining_entries */);
  }
  AddNewEditsToLog(kNumColumnFamilies);

  {
    // Delete the file with the corrupted number.
    VersionEdit fixup_edit;
    fixup_edit.SetColumnFamily(kNumColumnFamilies - 1);
    fixup_edit.DeleteFile(0 /* level */, 20 /* number */);
    // Checked once, before the first dereference of log_writer_ below.
    assert(log_writer_.get() != nullptr);
    std::string record;
    ASSERT_TRUE(fixup_edit.EncodeTo(&record, 0 /* ts_sz */));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record));

    // Throw in an impossible AtomicGroup afterwards for extra challenge.
    VersionEdit broken_edit;
    broken_edit.SetColumnFamily(0 /* column_family_id */);
    file_metas[0].fd.packed_number_and_path_id =
        PackFileNumberAndPathId(30 /* number */, 0 /* path_id */);
    broken_edit.AddFile(0 /* level */, file_metas[0]);
    broken_edit.SetLastSequence(++last_seqno_);
    broken_edit.MarkAtomicGroup(0 /* remaining_entries */);
    record.clear();
    ASSERT_TRUE(broken_edit.EncodeTo(&record, 0 /* ts_sz */));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record));
  }

  {
    bool has_missing_table_file = false;
    ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */,
                                    {DescriptorFileName(1 /* number */)},
                                    nullptr /* db_id */,
                                    &has_missing_table_file));
    ASSERT_TRUE(has_missing_table_file);
  }
  std::vector<uint64_t> all_table_files;
  std::vector<uint64_t> all_blob_files;
  versions_->AddLiveFiles(&all_table_files, &all_blob_files);
  // All files but the one whose reference was deleted are expected live.
  ASSERT_EQ(file_metas.size() - 1, all_table_files.size());
}
|
|
|
|
|
|
|
|
// Best-effort recovery of an AtomicGroup whose last edit refers to file
// number 20 and is never fixed up. Recovery is expected to succeed, report a
// missing table file, and leave no live table files.
TEST_F(AtomicGroupBestEffortRecoveryTest, HandleAtomicGroupUpdatesInvalid) {
  // The updates in this single AtomicGroup never become valid.
  std::vector<SstInfo> sst_infos;
  for (int cf = 0; cf < kNumColumnFamilies; ++cf) {
    const int number = 10 + cf;
    sst_infos.emplace_back(number, column_families_[cf].name, "" /* key */,
                           0 /* level */, number /* epoch_number */);
  }

  std::vector<FileMetaData> metas;
  CreateDummyTableFiles(sst_infos, &metas);

  edits_.clear();
  for (int cf = 0; cf < kNumColumnFamilies; ++cf) {
    const bool is_last_cf = (cf == kNumColumnFamilies - 1);
    if (is_last_cf) {
      // Corrupt the file number referenced by the last edit.
      metas[cf].fd.packed_number_and_path_id =
          PackFileNumberAndPathId(20 /* number */, 0 /* path_id */);
    }
    edits_.emplace_back();
    auto& edit = edits_.back();
    edit.SetColumnFamily(cf);
    edit.AddFile(0 /* level */, metas[cf]);
    edit.SetLastSequence(++last_seqno_);
    edit.MarkAtomicGroup(kNumColumnFamilies - 1 - cf /* remaining_entries */);
  }
  AddNewEditsToLog(kNumColumnFamilies);

  bool missing_table_file = false;
  ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */,
                                  {DescriptorFileName(1 /* number */)},
                                  nullptr /* db_id */, &missing_table_file));
  ASSERT_TRUE(missing_table_file);

  std::vector<uint64_t> live_table_files;
  std::vector<uint64_t> live_blob_files;
  versions_->AddLiveFiles(&live_table_files, &live_blob_files);
  ASSERT_TRUE(live_table_files.empty());
}
|
|
|
|
|
|
|
|
// Best-effort recovery of an AtomicGroup whose last edit refers to file
// number 20. The edit that deletes that reference arrives only inside a
// second AtomicGroup, bundled with an edit that refers to file number 30.
// Recovery is expected to succeed, report a missing table file, and leave no
// live table files.
TEST_F(AtomicGroupBestEffortRecoveryTest,
       HandleAtomicGroupUpdatesValidTooLate) {
  // One AtomicGroup contains updates that become valid after the next
  // AtomicGroup is reached, which is too late.

  // `SetupTestSyncPoints()` creates sync points that assume there is only one
  // AtomicGroup, which is not the case in this test.
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();

  std::vector<SstInfo> file_infos;
  for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) {
    int file_number = 10 + cfid;
    file_infos.emplace_back(file_number, column_families_[cfid].name,
                            "" /* key */, 0 /* level */,
                            file_number /* epoch_number */);
  }

  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(file_infos, &file_metas);

  edits_.clear();
  for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) {
    if (cfid == kNumColumnFamilies - 1) {
      // Corrupt the number of the last file.
      file_metas[cfid].fd.packed_number_and_path_id =
          PackFileNumberAndPathId(20 /* number */, 0 /* path_id */);
    }
    edits_.emplace_back();
    edits_.back().SetColumnFamily(cfid);
    edits_.back().AddFile(0 /* level */, file_metas[cfid]);
    edits_.back().SetLastSequence(++last_seqno_);
    edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 -
                                  cfid /* remaining_entries */);
  }
  AddNewEditsToLog(kNumColumnFamilies);

  {
    // Delete the file with the corrupted number. But bundle it in an
    // AtomicGroup with an update that can never be applied.
    VersionEdit broken_edit;
    broken_edit.SetColumnFamily(0 /* column_family_id */);
    file_metas[0].fd.packed_number_and_path_id =
        PackFileNumberAndPathId(30 /* number */, 0 /* path_id */);
    broken_edit.AddFile(0 /* level */, file_metas[0]);
    broken_edit.SetLastSequence(++last_seqno_);
    broken_edit.MarkAtomicGroup(1 /* remaining_entries */);
    // Verify the writer exists before it is dereferenced for the first time.
    assert(log_writer_.get() != nullptr);
    std::string record;
    ASSERT_TRUE(broken_edit.EncodeTo(&record, 0 /* ts_sz */));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record));

    VersionEdit fixup_edit;
    fixup_edit.SetColumnFamily(kNumColumnFamilies - 1);
    fixup_edit.DeleteFile(0 /* level */, 20 /* number */);
    fixup_edit.MarkAtomicGroup(0 /* remaining_entries */);
    record.clear();
    ASSERT_TRUE(fixup_edit.EncodeTo(&record, 0 /* ts_sz */));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record));
  }

  {
    bool has_missing_table_file = false;
    ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */,
                                    {DescriptorFileName(1 /* number */)},
                                    nullptr /* db_id */,
                                    &has_missing_table_file));
    ASSERT_TRUE(has_missing_table_file);
  }
  std::vector<uint64_t> all_table_files;
  std::vector<uint64_t> all_blob_files;
  versions_->AddLiveFiles(&all_table_files, &all_blob_files);
  ASSERT_TRUE(all_table_files.empty());
}
|
|
|
|
|
|
|
|
// Best-effort recovery of an AtomicGroup that updates column family 0 twice:
// first with a file created by CreateDummyTableFiles(), then with a reference
// to file number 20. Recovery is expected to succeed, report a missing table
// file, and leave no live table files.
TEST_F(AtomicGroupBestEffortRecoveryTest,
       HandleAtomicGroupUpdatesInDuplicateInvalid) {
  // The group holds several updates for one CF. An earlier update for that CF
  // could produce a valid state on its own, but the final update for it is
  // invalid, so the whole AtomicGroup must not be recovered.
  std::vector<SstInfo> sst_infos;
  for (int cf = 0; cf < kNumColumnFamilies; ++cf) {
    const int number = 10 + cf;
    sst_infos.emplace_back(number, column_families_[cf].name, "" /* key */,
                           0 /* level */, number /* epoch_number */);
  }

  std::vector<FileMetaData> metas;
  CreateDummyTableFiles(sst_infos, &metas);

  edits_.clear();
  for (int cf = 0; cf < kNumColumnFamilies; ++cf) {
    edits_.emplace_back();
    auto& edit = edits_.back();
    edit.SetColumnFamily(cf);
    edit.AddFile(0 /* level */, metas[cf]);
    edit.SetLastSequence(++last_seqno_);
    // One more entry follows the loop, hence no "- 1" here.
    edit.MarkAtomicGroup(kNumColumnFamilies - cf /* remaining_entries */);
  }

  // The unrecoverable update closes the group.
  metas[0].fd.packed_number_and_path_id =
      PackFileNumberAndPathId(20 /* number */, 0 /* path_id */);
  edits_.emplace_back();
  auto& unrecoverable_edit = edits_.back();
  unrecoverable_edit.SetColumnFamily(0 /* column_family_id */);
  unrecoverable_edit.AddFile(0 /* level */, metas[0]);
  unrecoverable_edit.SetLastSequence(++last_seqno_);
  unrecoverable_edit.MarkAtomicGroup(0 /* remaining_entries */);
  AddNewEditsToLog(kNumColumnFamilies + 1);

  bool missing_table_file = false;
  ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */,
                                  {DescriptorFileName(1 /* number */)},
                                  nullptr /* db_id */, &missing_table_file));
  ASSERT_TRUE(missing_table_file);

  std::vector<uint64_t> live_table_files;
  std::vector<uint64_t> live_blob_files;
  versions_->AddLiveFiles(&live_table_files, &live_blob_files);
  ASSERT_TRUE(live_table_files.empty());
}
|
|
|
|
|
|
|
|
// Best-effort recovery of an AtomicGroup whose last edit refers to file
// number 20, followed by an edit that drops that edit's column family.
// Recovery is expected to succeed without reporting a missing table file, and
// all files but one are expected to be live.
TEST_F(AtomicGroupBestEffortRecoveryTest,
       HandleAtomicGroupMadeWholeByDeletingCf) {
  // The invalid update in the AtomicGroup becomes irrelevant, and the group
  // therefore valid, once its column family is deleted.
  std::vector<SstInfo> sst_infos;
  for (int cf = 0; cf < kNumColumnFamilies; ++cf) {
    const int number = 10 + cf;
    sst_infos.emplace_back(number, column_families_[cf].name, "" /* key */,
                           0 /* level */, number /* epoch_number */);
  }

  std::vector<FileMetaData> metas;
  CreateDummyTableFiles(sst_infos, &metas);

  edits_.clear();
  for (int cf = 0; cf < kNumColumnFamilies; ++cf) {
    const bool is_last_cf = (cf == kNumColumnFamilies - 1);
    if (is_last_cf) {
      // Corrupt the file number referenced by the last edit.
      metas[cf].fd.packed_number_and_path_id =
          PackFileNumberAndPathId(20 /* number */, 0 /* path_id */);
    }
    edits_.emplace_back();
    auto& edit = edits_.back();
    edit.SetColumnFamily(cf);
    edit.AddFile(0 /* level */, metas[cf]);
    edit.SetLastSequence(++last_seqno_);
    edit.MarkAtomicGroup(kNumColumnFamilies - 1 - cf /* remaining_entries */);
  }
  AddNewEditsToLog(kNumColumnFamilies);

  {
    // Drop the column family whose edit carries the corrupted file number.
    VersionEdit drop_cf_edit;
    drop_cf_edit.DropColumnFamily();
    drop_cf_edit.SetColumnFamily(kNumColumnFamilies - 1);
    assert(log_writer_.get() != nullptr);
    std::string encoded;
    ASSERT_TRUE(drop_cf_edit.EncodeTo(&encoded, 0 /* ts_sz */));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), encoded));
  }

  bool missing_table_file = false;
  ASSERT_OK(versions_->TryRecover(column_families_, false /* read_only */,
                                  {DescriptorFileName(1 /* number */)},
                                  nullptr /* db_id */, &missing_table_file));
  ASSERT_FALSE(missing_table_file);

  std::vector<uint64_t> live_table_files;
  std::vector<uint64_t> live_blob_files;
  versions_->AddLiveFiles(&live_table_files, &live_blob_files);
  ASSERT_EQ(metas.size() - 1, live_table_files.size());
}
|
|
|
|
|
|
|
|
// Best-effort recovery of an AtomicGroup whose last edit refers to file
// number 20. Afterwards the log receives, in order: an edit adding column
// family "extra_cf", an edit making that new column family refer to file
// number 30, and an edit deleting the reference to file number 20. Recovery
// is expected to succeed, report a missing table file, and leave all files
// but one live.
TEST_F(AtomicGroupBestEffortRecoveryTest,
       HandleAtomicGroupMadeWholeAfterNewCf) {
  // One AtomicGroup contains updates that become valid after a new column
  // family is added.
  std::vector<SstInfo> file_infos;
  for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) {
    int file_number = 10 + cfid;
    file_infos.emplace_back(file_number, column_families_[cfid].name,
                            "" /* key */, 0 /* level */,
                            file_number /* epoch_number */);
  }

  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(file_infos, &file_metas);

  edits_.clear();
  for (int cfid = 0; cfid < kNumColumnFamilies; cfid++) {
    if (cfid == kNumColumnFamilies - 1) {
      // Corrupt the number of the last file.
      file_metas[cfid].fd.packed_number_and_path_id =
          PackFileNumberAndPathId(20 /* number */, 0 /* path_id */);
    }
    edits_.emplace_back();
    edits_.back().SetColumnFamily(cfid);
    edits_.back().AddFile(0 /* level */, file_metas[cfid]);
    edits_.back().SetLastSequence(++last_seqno_);
    edits_.back().MarkAtomicGroup(kNumColumnFamilies - 1 -
                                  cfid /* remaining_entries */);
  }
  AddNewEditsToLog(kNumColumnFamilies);

  {
    // Add a new CF.
    VersionEdit add_cf_edit;
    add_cf_edit.AddColumnFamily("extra_cf");
    add_cf_edit.SetColumnFamily(kNumColumnFamilies);
    // Verify the writer exists before it is dereferenced for the first time.
    assert(log_writer_.get() != nullptr);
    std::string record;
    ASSERT_TRUE(add_cf_edit.EncodeTo(&record, 0 /* ts_sz */));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record));

    // Have the new CF refer to a non-existent file for an extra challenge.
    VersionEdit broken_edit;
    broken_edit.SetColumnFamily(kNumColumnFamilies);
    file_metas[0].fd.packed_number_and_path_id =
        PackFileNumberAndPathId(30 /* number */, 0 /* path_id */);
    broken_edit.AddFile(0 /* level */, file_metas[0]);
    broken_edit.SetLastSequence(++last_seqno_);
    record.clear();
    ASSERT_TRUE(broken_edit.EncodeTo(&record, 0 /* ts_sz */));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record));

    // This fixes up the first of the two non-existent file references.
    VersionEdit fixup_edit;
    fixup_edit.SetColumnFamily(kNumColumnFamilies - 1);
    fixup_edit.DeleteFile(0 /* level */, 20 /* number */);
    record.clear();
    ASSERT_TRUE(fixup_edit.EncodeTo(&record, 0 /* ts_sz */));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), record));
  }

  {
    bool has_missing_table_file = false;
    // Recovery must be told about the column family added above.
    std::vector<ColumnFamilyDescriptor> column_families = column_families_;
    column_families.emplace_back("extra_cf", cf_options_);
    ASSERT_OK(versions_->TryRecover(column_families, false /* read_only */,
                                    {DescriptorFileName(1 /* number */)},
                                    nullptr /* db_id */,
                                    &has_missing_table_file));
    ASSERT_TRUE(has_missing_table_file);
  }
  std::vector<uint64_t> all_table_files;
  std::vector<uint64_t> all_blob_files;
  versions_->AddLiveFiles(&all_table_files, &all_blob_files);
  ASSERT_EQ(file_metas.size() - 1, all_table_files.size());
}
|
|
|
|
|
2018-12-13 23:10:16 +00:00
|
|
|
class VersionSetTestDropOneCF : public VersionSetTestBase,
|
|
|
|
public testing::TestWithParam<std::string> {
|
|
|
|
public:
|
2020-03-21 02:17:54 +00:00
|
|
|
VersionSetTestDropOneCF()
|
|
|
|
: VersionSetTestBase("version_set_test_drop_one_cf") {}
|
2018-12-13 23:10:16 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// This test simulates the following execution sequence
|
|
|
|
// Time thread1 bg_flush_thr
|
|
|
|
// | Prepare version edits (e1,e2,e3) for atomic
|
|
|
|
// | flush cf1, cf2, cf3
|
|
|
|
// | Enqueue e to drop cfi
|
|
|
|
// | to manifest_writers_
|
|
|
|
// | Enqueue (e1,e2,e3) to manifest_writers_
|
|
|
|
// |
|
|
|
|
// | Apply e,
|
|
|
|
// | cfi.IsDropped() is true
|
|
|
|
// | Apply (e1,e2,e3),
|
|
|
|
// | since cfi.IsDropped() == true, we need to
|
|
|
|
// | drop ei and write the rest to MANIFEST.
|
|
|
|
// V
|
|
|
|
//
|
|
|
|
// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and
|
|
|
|
// last column family in an atomic group.
|
|
|
|
TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
|
2023-04-21 16:07:18 +00:00
|
|
|
const ReadOptions read_options;
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
const WriteOptions write_options;
|
2023-04-21 16:07:18 +00:00
|
|
|
|
2018-12-13 23:10:16 +00:00
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
SequenceNumber last_seqno;
|
|
|
|
std::unique_ptr<log::Writer> log_writer;
|
|
|
|
PrepareManifest(&column_families, &last_seqno, &log_writer);
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2018-12-13 23:10:16 +00:00
|
|
|
|
|
|
|
EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
|
|
|
|
EXPECT_EQ(column_families.size(),
|
|
|
|
versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
|
|
|
|
|
|
|
|
const int kAtomicGroupSize = 3;
|
|
|
|
const std::vector<std::string> non_default_cf_names = {
|
|
|
|
kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3};
|
|
|
|
|
|
|
|
// Drop one column family
|
|
|
|
VersionEdit drop_cf_edit;
|
|
|
|
drop_cf_edit.DropColumnFamily();
|
|
|
|
const std::string cf_to_drop_name(GetParam());
|
|
|
|
auto cfd_to_drop =
|
|
|
|
versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name);
|
|
|
|
ASSERT_NE(nullptr, cfd_to_drop);
|
2019-01-04 04:53:52 +00:00
|
|
|
// Increase its refcount because cfd_to_drop is used later, and we need to
|
|
|
|
// prevent it from being deleted.
|
|
|
|
cfd_to_drop->Ref();
|
2018-12-13 23:10:16 +00:00
|
|
|
drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
|
|
|
|
mutex_.Lock();
|
2024-08-24 02:49:25 +00:00
|
|
|
Status s = versions_->LogAndApply(
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), read_options,
|
|
|
|
write_options, &drop_cf_edit, &mutex_, nullptr);
|
2018-12-13 23:10:16 +00:00
|
|
|
mutex_.Unlock();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
std::vector<VersionEdit> edits(kAtomicGroupSize);
|
|
|
|
uint32_t remaining = kAtomicGroupSize;
|
|
|
|
size_t i = 0;
|
|
|
|
autovector<ColumnFamilyData*> cfds;
|
|
|
|
autovector<const MutableCFOptions*> mutable_cf_options_list;
|
|
|
|
autovector<autovector<VersionEdit*>> edit_lists;
|
|
|
|
for (const auto& cf_name : non_default_cf_names) {
|
|
|
|
auto cfd = (cf_name != cf_to_drop_name)
|
|
|
|
? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name)
|
|
|
|
: cfd_to_drop;
|
|
|
|
ASSERT_NE(nullptr, cfd);
|
|
|
|
cfds.push_back(cfd);
|
|
|
|
mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
|
|
|
|
edits[i].SetColumnFamily(cfd->GetID());
|
|
|
|
edits[i].SetLogNumber(0);
|
|
|
|
edits[i].SetNextFile(2);
|
|
|
|
edits[i].MarkAtomicGroup(--remaining);
|
|
|
|
edits[i].SetLastSequence(last_seqno++);
|
|
|
|
autovector<VersionEdit*> tmp_edits;
|
|
|
|
tmp_edits.push_back(&edits[i]);
|
|
|
|
edit_lists.emplace_back(tmp_edits);
|
|
|
|
++i;
|
|
|
|
}
|
|
|
|
int called = 0;
|
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
|
|
|
SyncPoint::GetInstance()->ClearAllCallBacks();
|
|
|
|
SyncPoint::GetInstance()->SetCallBack(
|
|
|
|
"VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) {
|
|
|
|
std::vector<VersionEdit*>* tmp_edits =
|
Prefer static_cast in place of most reinterpret_cast (#12308)
Summary:
The following are risks associated with pointer-to-pointer reinterpret_cast:
* Can produce the "wrong result" (crash or memory corruption). IIRC, in theory this can happen for any up-cast or down-cast for a non-standard-layout type, though in practice would only happen for multiple inheritance cases (where the base class pointer might be "inside" the derived object). We don't use multiple inheritance a lot, but we do.
* Can mask useful compiler errors upon code change, including converting between unrelated pointer types that you are expecting to be related, and converting between pointer and scalar types unintentionally.
I can only think of some obscure cases where static_cast could be troublesome when it compiles as a replacement:
* Going through `void*` could plausibly cause unnecessary or broken pointer arithmetic. Suppose we have
`struct Derived: public Base1, public Base2`. If we have `Derived*` -> `void*` -> `Base2*` -> `Derived*` through reinterpret casts, this could plausibly work (though technical UB) assuming the `Base2*` is not dereferenced. Changing to static cast could introduce breaking pointer arithmetic.
* Unnecessary (but safe) pointer arithmetic could arise in a case like `Derived*` -> `Base2*` -> `Derived*` where before the Base2 pointer might not have been dereferenced. This could potentially affect performance.
With some light scripting, I tried replacing pointer-to-pointer reinterpret_casts with static_cast and kept the cases that still compile. Most occurrences of reinterpret_cast have successfully been changed (except for java/ and third-party/). 294 changed, 257 remain.
A couple of related interventions included here:
* Previously Cache::Handle was not actually derived from in the implementations and just used as a `void*` stand-in with reinterpret_cast. Now there is a relationship to allow static_cast. In theory, this could introduce pointer arithmetic (as described above) but is unlikely without multiple inheritance AND non-empty Cache::Handle.
* Remove some unnecessary casts to void* as this is allowed to be implicit (for better or worse).
Most of the remaining reinterpret_casts are for converting to/from raw bytes of objects. We could consider better idioms for these patterns in follow-up work.
I wish there were a way to implement a template variant of static_cast that would only compile if no pointer arithmetic is generated, but best I can tell, this is not possible. AFAIK the best you could do is a dynamic check that the void* conversion after the static cast is unchanged.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12308
Test Plan: existing tests, CI
Reviewed By: ltamasi
Differential Revision: D53204947
Pulled By: pdillinger
fbshipit-source-id: 9de23e618263b0d5b9820f4e15966876888a16e2
2024-02-07 18:44:11 +00:00
|
|
|
static_cast<std::vector<VersionEdit*>*>(arg);
|
2018-12-13 23:10:16 +00:00
|
|
|
EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size());
|
|
|
|
for (const auto e : *tmp_edits) {
|
|
|
|
bool found = false;
|
|
|
|
for (const auto& e2 : edits) {
|
|
|
|
if (&e2 == e) {
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_TRUE(found);
|
|
|
|
}
|
|
|
|
++called;
|
|
|
|
});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
|
|
|
mutex_.Lock();
|
2023-04-21 16:07:18 +00:00
|
|
|
s = versions_->LogAndApply(cfds, mutable_cf_options_list, read_options,
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
write_options, edit_lists, &mutex_, nullptr);
|
2018-12-13 23:10:16 +00:00
|
|
|
mutex_.Unlock();
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_EQ(1, called);
|
2020-12-11 19:17:11 +00:00
|
|
|
cfd_to_drop->UnrefAndTryDelete();
|
2018-12-13 23:10:16 +00:00
|
|
|
}
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
// Instantiate VersionSetTestDropOneCF once per non-default column family
// name, so that each of the three is used as the test parameter in turn.
INSTANTIATE_TEST_CASE_P(
    AtomicGroup, VersionSetTestDropOneCF,
    testing::Values(VersionSetTestBase::kColumnFamilyName1,
                    VersionSetTestBase::kColumnFamilyName2,
                    VersionSetTestBase::kColumnFamilyName3));
|
2020-03-21 02:17:54 +00:00
|
|
|
|
|
|
|
class EmptyDefaultCfNewManifest : public VersionSetTestBase,
|
|
|
|
public testing::Test {
|
|
|
|
public:
|
|
|
|
EmptyDefaultCfNewManifest() : VersionSetTestBase("version_set_new_db_test") {}
|
|
|
|
// Emulate DBImpl::NewDB()
|
|
|
|
void PrepareManifest(std::vector<ColumnFamilyDescriptor>* /*column_families*/,
|
|
|
|
SequenceNumber* /*last_seqno*/,
|
|
|
|
std::unique_ptr<log::Writer>* log_writer) override {
|
|
|
|
assert(log_writer != nullptr);
|
|
|
|
VersionEdit new_db;
|
|
|
|
new_db.SetLogNumber(0);
|
|
|
|
const std::string manifest_path = DescriptorFileName(dbname_, 1);
|
2021-01-29 06:08:46 +00:00
|
|
|
const auto& fs = env_->GetFileSystem();
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer;
|
|
|
|
Status s = WritableFileWriter::Create(
|
|
|
|
fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
|
|
|
|
&file_writer, nullptr);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
log_writer->reset(new log::Writer(std::move(file_writer), 0, true));
|
|
|
|
std::string record;
|
|
|
|
ASSERT_TRUE(new_db.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
// Create new column family
|
|
|
|
VersionEdit new_cf;
|
|
|
|
new_cf.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
|
|
|
|
new_cf.SetColumnFamily(1);
|
|
|
|
new_cf.SetLastSequence(2);
|
|
|
|
new_cf.SetNextFile(2);
|
|
|
|
record.clear();
|
|
|
|
ASSERT_TRUE(new_cf.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
bool write_dbid_to_manifest_ = false;
|
|
|
|
std::unique_ptr<log::Writer> log_writer_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Create db, create column family. Cf creation will switch to a new MANIFEST.
|
|
|
|
// Then reopen db, trying to recover.
|
|
|
|
TEST_F(EmptyDefaultCfNewManifest, Recover) {
|
|
|
|
PrepareManifest(nullptr, nullptr, &log_writer_);
|
|
|
|
log_writer_.reset();
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2020-03-21 02:17:54 +00:00
|
|
|
std::string manifest_path;
|
|
|
|
VerifyManifest(&manifest_path);
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
|
|
|
|
column_families.emplace_back(VersionSetTestBase::kColumnFamilyName1,
|
|
|
|
cf_options_);
|
|
|
|
std::string db_id;
|
|
|
|
bool has_missing_table_file = false;
|
2024-08-24 02:49:25 +00:00
|
|
|
Status s = versions_->TryRecoverFromOneManifest(
|
2020-03-21 02:17:54 +00:00
|
|
|
manifest_path, column_families, false, &db_id, &has_missing_table_file);
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_FALSE(has_missing_table_file);
|
|
|
|
}
|
|
|
|
|
|
|
|
class VersionSetTestEmptyDb
|
|
|
|
: public VersionSetTestBase,
|
|
|
|
public testing::TestWithParam<
|
|
|
|
std::tuple<bool, bool, std::vector<std::string>>> {
|
|
|
|
public:
|
|
|
|
static const std::string kUnknownColumnFamilyName;
|
|
|
|
VersionSetTestEmptyDb() : VersionSetTestBase("version_set_test_empty_db") {}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
void PrepareManifest(std::vector<ColumnFamilyDescriptor>* /*column_families*/,
|
|
|
|
SequenceNumber* /*last_seqno*/,
|
|
|
|
std::unique_ptr<log::Writer>* log_writer) override {
|
|
|
|
assert(nullptr != log_writer);
|
|
|
|
VersionEdit new_db;
|
|
|
|
if (db_options_.write_dbid_to_manifest) {
|
2024-08-24 02:49:25 +00:00
|
|
|
ASSERT_OK(SetIdentityFile(WriteOptions(), env_, dbname_,
|
|
|
|
Temperature::kUnknown));
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
DBOptions tmp_db_options;
|
|
|
|
tmp_db_options.env = env_;
|
|
|
|
std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
|
2020-03-21 02:17:54 +00:00
|
|
|
std::string db_id;
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(impl->GetDbIdentityFromIdentityFile(&db_id));
|
2020-03-21 02:17:54 +00:00
|
|
|
new_db.SetDBId(db_id);
|
|
|
|
}
|
|
|
|
const std::string manifest_path = DescriptorFileName(dbname_, 1);
|
2021-01-29 06:08:46 +00:00
|
|
|
const auto& fs = env_->GetFileSystem();
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer;
|
|
|
|
Status s = WritableFileWriter::Create(
|
|
|
|
fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
|
|
|
|
&file_writer, nullptr);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
{
|
|
|
|
log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
|
|
|
|
std::string record;
|
|
|
|
new_db.EncodeTo(&record);
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<log::Writer> log_writer_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Out-of-class definition of the static member declared in
// VersionSetTestEmptyDb.
const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown";
|
|
|
|
|
|
|
|
// Recovers from a MANIFEST that holds only the single record written by
// PrepareManifest(). Recovery must fail: with InvalidArgument when the default
// column family is not among the requested names, otherwise with a Corruption
// status whose message mentions the MANIFEST path.
TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
  const auto& param = GetParam();
  db_options_.write_dbid_to_manifest = std::get<0>(param);

  // Write the MANIFEST, then release the writer before reading it back.
  PrepareManifest(nullptr, nullptr, &log_writer_);
  log_writer_.reset();
  CreateCurrentFile();

  std::string manifest_path;
  VerifyManifest(&manifest_path);

  const bool open_read_only = std::get<1>(param);
  const std::vector<std::string>& requested_cfs = std::get<2>(param);

  std::vector<ColumnFamilyDescriptor> cf_descriptors;
  for (const std::string& name : requested_cfs) {
    cf_descriptors.emplace_back(name, cf_options_);
  }

  bool has_missing_table_file = false;
  std::string db_id;
  const Status recover_status = versions_->TryRecoverFromOneManifest(
      manifest_path, cf_descriptors, open_read_only, &db_id,
      &has_missing_table_file);

  const bool default_cf_requested =
      std::find(requested_cfs.begin(), requested_cfs.end(),
                kDefaultColumnFamilyName) != requested_cfs.end();
  if (!default_cf_requested) {
    ASSERT_TRUE(recover_status.IsInvalidArgument());
  } else {
    ASSERT_NE(recover_status.ToString().find(manifest_path),
              std::string::npos);
    ASSERT_TRUE(recover_status.IsCorruption());
  }
}
|
|
|
|
|
|
|
|
// Recovery must fail when, beyond what PrepareManifest() wrote, the MANIFEST
// only records the creation of a single column family (kColumnFamilyName1,
// id 1).
//
// Test parameters (read via GetParam()):
//   <0> value for db_options_.write_dbid_to_manifest
//   <1> whether recovery is attempted in read-only mode
//   <2> names of the column families passed to recovery
TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
  PrepareManifest(nullptr, nullptr, &log_writer_);
  // Only a subset of column families in the MANIFEST.
  VersionEdit new_cf1;
  new_cf1.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
  new_cf1.SetColumnFamily(1);
  Status s;
  {
    std::string record;
    // Check the encoding result, as the sibling tests do, so that an
    // encoding failure cannot masquerade as the expected recovery failure.
    ASSERT_TRUE(new_cf1.EncodeTo(&record));
    s = log_writer_->AddRecord(WriteOptions(), record);
    ASSERT_OK(s);
  }
  log_writer_.reset();
  CreateCurrentFile();

  std::string manifest_path;
  VerifyManifest(&manifest_path);

  bool read_only = std::get<1>(GetParam());
  const std::vector<std::string>& cf_names = std::get<2>(GetParam());
  std::vector<ColumnFamilyDescriptor> column_families;
  for (const auto& cf_name : cf_names) {
    column_families.emplace_back(cf_name, cf_options_);
  }
  std::string db_id;
  bool has_missing_table_file = false;
  s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
                                           read_only, &db_id,
                                           &has_missing_table_file);
  auto iter =
      std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
  if (iter == cf_names.end()) {
    // The default column family was not requested by the caller.
    ASSERT_TRUE(s.IsInvalidArgument());
  } else {
    // The error message is expected to name the offending MANIFEST file.
    ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
    ASSERT_TRUE(s.IsCorruption());
  }
}
|
|
|
|
|
|
|
|
// Recovery must fail when the MANIFEST lists every non-default column family
// but this test never appends a record carrying log_number, next_file_number
// and last_sequence.
//
// Test parameters (read via GetParam()):
//   <0> value for db_options_.write_dbid_to_manifest
//   <1> whether recovery is attempted in read-only mode
//   <2> names of the column families passed to recovery
TEST_P(VersionSetTestEmptyDb, OpenFromInCompleteManifest2) {
  db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
  PrepareManifest(nullptr, nullptr, &log_writer_);

  const std::vector<std::string> all_cf_names = {
      kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
      kColumnFamilyName3};
  // Index 0 (the default column family) is skipped; each remaining name is
  // recorded with a column family id equal to its position in the list.
  for (size_t idx = 1; idx < all_cf_names.size(); ++idx) {
    VersionEdit add_cf_edit;
    add_cf_edit.AddColumnFamily(all_cf_names[idx]);
    add_cf_edit.SetColumnFamily(static_cast<uint32_t>(idx));
    std::string encoded;
    ASSERT_TRUE(add_cf_edit.EncodeTo(&encoded));
    ASSERT_OK(log_writer_->AddRecord(WriteOptions(), encoded));
  }
  log_writer_.reset();
  CreateCurrentFile();

  std::string manifest_path;
  VerifyManifest(&manifest_path);

  const bool open_read_only = std::get<1>(GetParam());
  const std::vector<std::string>& requested_cfs = std::get<2>(GetParam());
  std::vector<ColumnFamilyDescriptor> cf_descriptors;
  for (const std::string& name : requested_cfs) {
    cf_descriptors.emplace_back(name, cf_options_);
  }

  std::string db_id;
  bool has_missing_table_file = false;
  const Status status = versions_->TryRecoverFromOneManifest(
      manifest_path, cf_descriptors, open_read_only, &db_id,
      &has_missing_table_file);

  const bool default_cf_requested =
      std::find(requested_cfs.begin(), requested_cfs.end(),
                kDefaultColumnFamilyName) != requested_cfs.end();
  if (default_cf_requested) {
    // The error message is expected to name the offending MANIFEST file.
    ASSERT_NE(status.ToString().find(manifest_path), std::string::npos);
    ASSERT_TRUE(status.IsCorruption());
  } else {
    ASSERT_TRUE(status.IsInvalidArgument());
  }
}
|
|
|
|
|
|
|
|
TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
|
|
|
|
db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
|
|
|
|
PrepareManifest(nullptr, nullptr, &log_writer_);
|
|
|
|
// Write all column families but no log_number, next_file_number and
|
|
|
|
// last_sequence.
|
|
|
|
const std::vector<std::string> all_cf_names = {
|
|
|
|
kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
|
|
|
|
kColumnFamilyName3};
|
|
|
|
uint32_t cf_id = 1;
|
|
|
|
Status s;
|
|
|
|
for (size_t i = 1; i != all_cf_names.size(); ++i) {
|
|
|
|
VersionEdit new_cf;
|
|
|
|
new_cf.AddColumnFamily(all_cf_names[i]);
|
|
|
|
new_cf.SetColumnFamily(cf_id++);
|
|
|
|
std::string record;
|
|
|
|
ASSERT_TRUE(new_cf.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = log_writer_->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
VersionEdit tmp_edit;
|
|
|
|
tmp_edit.SetColumnFamily(4);
|
|
|
|
tmp_edit.SetLogNumber(0);
|
|
|
|
tmp_edit.SetNextFile(2);
|
|
|
|
tmp_edit.SetLastSequence(0);
|
|
|
|
std::string record;
|
|
|
|
ASSERT_TRUE(tmp_edit.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = log_writer_->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
log_writer_.reset();
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2020-03-21 02:17:54 +00:00
|
|
|
|
|
|
|
std::string manifest_path;
|
|
|
|
VerifyManifest(&manifest_path);
|
|
|
|
|
|
|
|
bool read_only = std::get<1>(GetParam());
|
|
|
|
const std::vector<std::string>& cf_names = std::get<2>(GetParam());
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
for (const auto& cf_name : cf_names) {
|
|
|
|
column_families.emplace_back(cf_name, cf_options_);
|
|
|
|
}
|
|
|
|
std::string db_id;
|
|
|
|
bool has_missing_table_file = false;
|
|
|
|
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
|
|
|
|
read_only, &db_id,
|
|
|
|
&has_missing_table_file);
|
|
|
|
auto iter =
|
|
|
|
std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
|
|
|
|
if (iter == cf_names.end()) {
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
} else {
|
2022-01-08 02:08:50 +00:00
|
|
|
ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_TRUE(s.IsCorruption());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
|
|
|
|
db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
|
|
|
|
PrepareManifest(nullptr, nullptr, &log_writer_);
|
|
|
|
// Write all column families but no log_number, next_file_number and
|
|
|
|
// last_sequence.
|
|
|
|
const std::vector<std::string> all_cf_names = {
|
|
|
|
kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
|
|
|
|
kColumnFamilyName3};
|
|
|
|
uint32_t cf_id = 1;
|
|
|
|
Status s;
|
|
|
|
for (size_t i = 1; i != all_cf_names.size(); ++i) {
|
|
|
|
VersionEdit new_cf;
|
|
|
|
new_cf.AddColumnFamily(all_cf_names[i]);
|
|
|
|
new_cf.SetColumnFamily(cf_id++);
|
|
|
|
std::string record;
|
|
|
|
ASSERT_TRUE(new_cf.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = log_writer_->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
VersionEdit tmp_edit;
|
|
|
|
tmp_edit.SetLogNumber(0);
|
|
|
|
tmp_edit.SetNextFile(2);
|
|
|
|
tmp_edit.SetLastSequence(0);
|
|
|
|
std::string record;
|
|
|
|
ASSERT_TRUE(tmp_edit.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = log_writer_->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
log_writer_.reset();
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2020-03-21 02:17:54 +00:00
|
|
|
|
|
|
|
std::string manifest_path;
|
|
|
|
VerifyManifest(&manifest_path);
|
|
|
|
|
|
|
|
bool read_only = std::get<1>(GetParam());
|
|
|
|
const std::vector<std::string>& cf_names = std::get<2>(GetParam());
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
for (const auto& cf_name : cf_names) {
|
|
|
|
column_families.emplace_back(cf_name, cf_options_);
|
|
|
|
}
|
|
|
|
std::string db_id;
|
|
|
|
bool has_missing_table_file = false;
|
|
|
|
s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
|
|
|
|
read_only, &db_id,
|
|
|
|
&has_missing_table_file);
|
|
|
|
auto iter =
|
|
|
|
std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
|
|
|
|
if (iter == cf_names.end()) {
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
} else if (read_only) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_FALSE(has_missing_table_file);
|
|
|
|
} else if (cf_names.size() == all_cf_names.size()) {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_FALSE(has_missing_table_file);
|
|
|
|
} else if (cf_names.size() < all_cf_names.size()) {
|
|
|
|
ASSERT_TRUE(s.IsInvalidArgument());
|
|
|
|
} else {
|
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_FALSE(has_missing_table_file);
|
|
|
|
ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(
|
|
|
|
kUnknownColumnFamilyName);
|
|
|
|
ASSERT_EQ(nullptr, cfd);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-03 22:53:09 +00:00
|
|
|
INSTANTIATE_TEST_CASE_P(
|
2020-03-21 02:17:54 +00:00
|
|
|
BestEffortRecovery, VersionSetTestEmptyDb,
|
|
|
|
testing::Combine(
|
|
|
|
/*write_dbid_to_manifest=*/testing::Bool(),
|
|
|
|
/*read_only=*/testing::Bool(),
|
|
|
|
/*cf_names=*/
|
|
|
|
testing::Values(
|
|
|
|
std::vector<std::string>(),
|
|
|
|
std::vector<std::string>({kDefaultColumnFamilyName}),
|
|
|
|
std::vector<std::string>({VersionSetTestBase::kColumnFamilyName1,
|
|
|
|
VersionSetTestBase::kColumnFamilyName2,
|
|
|
|
VersionSetTestBase::kColumnFamilyName3}),
|
|
|
|
std::vector<std::string>({kDefaultColumnFamilyName,
|
|
|
|
VersionSetTestBase::kColumnFamilyName1}),
|
|
|
|
std::vector<std::string>({kDefaultColumnFamilyName,
|
|
|
|
VersionSetTestBase::kColumnFamilyName1,
|
|
|
|
VersionSetTestBase::kColumnFamilyName2,
|
|
|
|
VersionSetTestBase::kColumnFamilyName3}),
|
|
|
|
std::vector<std::string>(
|
|
|
|
{kDefaultColumnFamilyName,
|
|
|
|
VersionSetTestBase::kColumnFamilyName1,
|
|
|
|
VersionSetTestBase::kColumnFamilyName2,
|
|
|
|
VersionSetTestBase::kColumnFamilyName3,
|
|
|
|
VersionSetTestEmptyDb::kUnknownColumnFamilyName}))));
|
|
|
|
|
|
|
|
class VersionSetTestMissingFiles : public VersionSetTestBase,
|
|
|
|
public testing::Test {
|
|
|
|
public:
|
2024-08-17 00:18:54 +00:00
|
|
|
explicit VersionSetTestMissingFiles(
|
|
|
|
const std::string& test_name = "version_set_test_missing_files")
|
|
|
|
: VersionSetTestBase(test_name),
|
2020-03-21 02:17:54 +00:00
|
|
|
internal_comparator_(
|
|
|
|
std::make_shared<InternalKeyComparator>(options_.comparator)) {}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
void PrepareManifest(std::vector<ColumnFamilyDescriptor>* column_families,
|
|
|
|
SequenceNumber* last_seqno,
|
|
|
|
std::unique_ptr<log::Writer>* log_writer) override {
|
|
|
|
assert(column_families != nullptr);
|
|
|
|
assert(last_seqno != nullptr);
|
|
|
|
assert(log_writer != nullptr);
|
2024-09-19 21:05:21 +00:00
|
|
|
ASSERT_OK(
|
|
|
|
SetIdentityFile(WriteOptions(), env_, dbname_, Temperature::kUnknown));
|
2020-03-21 02:17:54 +00:00
|
|
|
const std::string manifest = DescriptorFileName(dbname_, 1);
|
2021-01-29 06:08:46 +00:00
|
|
|
const auto& fs = env_->GetFileSystem();
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer;
|
|
|
|
Status s = WritableFileWriter::Create(
|
|
|
|
fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
|
|
|
|
nullptr);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
|
|
|
|
VersionEdit new_db;
|
|
|
|
if (db_options_.write_dbid_to_manifest) {
|
Fix many tests to run with MEM_ENV and ENCRYPTED_ENV; Introduce a MemoryFileSystem class (#7566)
Summary:
This PR does a few things:
1. The MockFileSystem class was split out from the MockEnv. This change would theoretically allow a MockFileSystem to be used by other Environments as well (if we created a means of constructing one). The MockFileSystem implements a FileSystem in its entirety and does not rely on any Wrapper implementation.
2. Make the RocksDB test suite work when MOCK_ENV=1 and ENCRYPTED_ENV=1 are set. To accomplish this, a few things were needed:
- The tests that tried to use the "wrong" environment (Env::Default() instead of env_) were updated
- The MockFileSystem was changed to support the features it was missing or mishandled (such as recursively deleting files in a directory or supporting renaming of a directory).
3. Updated the test framework to have a ROCKSDB_GTEST_SKIP macro. This can be used to flag tests that are skipped. Currently, this defaults to doing nothing (marks the test as SUCCESS) but will mark the tests as SKIPPED when RocksDB is upgraded to a version of gtest that supports this (gtest-1.10).
I have run a full "make check" with MEM_ENV, ENCRYPTED_ENV, both, and neither under both MacOS and RedHat. A few tests were disabled/skipped for the MEM/ENCRYPTED cases. The error_handler_fs_test fails/hangs for MEM_ENV (presumably a timing problem) and I will introduce another PR/issue to track that problem. (I will also push a change to disable those tests soon). There is one more test in DBTest2 that also fails which I need to investigate or skip before this PR is merged.
Theoretically, this PR should also allow the test suite to run against an Env loaded from the registry, though I do not have one to try it with currently.
Finally, once this is accepted, it would be nice if there was a CircleCI job to run these tests on a checkin so this effort does not become stale. I do not know how to do that, so if someone could write that job, it would be appreciated :)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7566
Reviewed By: zhichao-cao
Differential Revision: D24408980
Pulled By: jay-zhuang
fbshipit-source-id: 911b1554a4d0da06fd51feca0c090a4abdcb4a5f
2020-10-27 17:31:34 +00:00
|
|
|
DBOptions tmp_db_options;
|
|
|
|
tmp_db_options.env = env_;
|
|
|
|
std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
|
2020-03-21 02:17:54 +00:00
|
|
|
std::string db_id;
|
2023-08-09 22:46:44 +00:00
|
|
|
ASSERT_OK(impl->GetDbIdentityFromIdentityFile(&db_id));
|
2020-03-21 02:17:54 +00:00
|
|
|
new_db.SetDBId(db_id);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
std::string record;
|
|
|
|
ASSERT_TRUE(new_db.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN included files that failed to sync and bytes that failed to be written.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
const std::vector<std::string> cf_names = {
|
|
|
|
kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
|
|
|
|
kColumnFamilyName3};
|
|
|
|
uint32_t cf_id = 1; // default cf id is 0
|
|
|
|
cf_options_.table_factory = table_factory_;
|
|
|
|
for (const auto& cf_name : cf_names) {
|
|
|
|
column_families->emplace_back(cf_name, cf_options_);
|
|
|
|
if (cf_name == kDefaultColumnFamilyName) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
VersionEdit new_cf;
|
|
|
|
new_cf.AddColumnFamily(cf_name);
|
|
|
|
new_cf.SetColumnFamily(cf_id);
|
|
|
|
std::string record;
|
|
|
|
ASSERT_TRUE(new_cf.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include files that failed to sync and bytes that failed to be written.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similarly for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
Compaction, DB open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher because each Append() counts as one post-PR, while pre-PR, 3 Append()s counted as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher because each Append() counts as one post-PR, while pre-PR, 3 Append()s counted as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20 ns, regressed 1.85%. "ManualCompaction" includes flush, so the isolated regression for compaction should be around 1.85 - 0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
|
|
|
|
VersionEdit cf_files;
|
|
|
|
cf_files.SetColumnFamily(cf_id);
|
|
|
|
cf_files.SetLogNumber(0);
|
|
|
|
record.clear();
|
|
|
|
ASSERT_TRUE(cf_files.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include files that failed to sync and bytes that failed to be written.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similarly for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
Compaction, DB open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher because each Append() counts as one post-PR, while pre-PR, 3 Append()s counted as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher because each Append() counts as one post-PR, while pre-PR, 3 Append()s counted as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20 ns, regressed 1.85%. "ManualCompaction" includes flush, so the isolated regression for compaction should be around 1.85 - 0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
++cf_id;
|
|
|
|
}
|
|
|
|
SequenceNumber seq = 2;
|
|
|
|
{
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.SetNextFile(7);
|
|
|
|
edit.SetLastSequence(seq);
|
|
|
|
std::string record;
|
|
|
|
ASSERT_TRUE(edit.EncodeTo(&record));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactoring to make the implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include files that failed to sync and bytes that failed to be written.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOptions so BuildTable() does not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOptions::rate_limiter_priority. Similarly for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
Compaction, DB open
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher because each Append() counts as one post-PR, while pre-PR, 3 Append()s counted as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher because each Append() counts as one post-PR, while pre-PR, 3 Append()s counted as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20 ns, regressed 1.85%. "ManualCompaction" includes flush, so the isolated regression for compaction should be around 1.85 - 0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
s = (*log_writer)->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
*last_seqno = seq + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// This method updates last_seqno_.
|
|
|
|
void WriteFileAdditionAndDeletionToManifest(
|
|
|
|
uint32_t cf, const std::vector<std::pair<int, FileMetaData>>& added_files,
|
2024-08-17 00:18:54 +00:00
|
|
|
const std::vector<std::pair<int, uint64_t>>& deleted_files,
|
|
|
|
const std::vector<BlobFileAddition>& blob_files = {}) {
|
2020-03-21 02:17:54 +00:00
|
|
|
VersionEdit edit;
|
|
|
|
edit.SetColumnFamily(cf);
|
|
|
|
for (const auto& elem : added_files) {
|
|
|
|
int level = elem.first;
|
|
|
|
edit.AddFile(level, elem.second);
|
|
|
|
}
|
|
|
|
for (const auto& elem : deleted_files) {
|
|
|
|
int level = elem.first;
|
|
|
|
edit.DeleteFile(level, elem.second);
|
|
|
|
}
|
2024-08-17 00:18:54 +00:00
|
|
|
for (const auto& elem : blob_files) {
|
|
|
|
edit.AddBlobFile(elem);
|
|
|
|
}
|
2020-03-21 02:17:54 +00:00
|
|
|
edit.SetLastSequence(last_seqno_);
|
|
|
|
++last_seqno_;
|
|
|
|
assert(log_writer_.get() != nullptr);
|
|
|
|
std::string record;
|
2023-07-10 18:03:25 +00:00
|
|
|
ASSERT_TRUE(edit.EncodeTo(&record, 0 /* ts_sz */));
|
Group SST write in flush, compaction and db open with new stats (#11910)
Summary:
## Context/Summary
Similar to https://github.com/facebook/rocksdb/pull/11288, https://github.com/facebook/rocksdb/pull/11444, categorizing SST/blob file write according to different io activities allows more insight into the activity.
For that, this PR does the following:
- Tag different write IOs by passing down and converting WriteOptions to IOOptions
- Add new SST_WRITE_MICROS histogram in WritableFileWriter::Append() and breakdown FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS
Some related code refactory to make implementation cleaner:
- Blob stats
- Replace high-level write measurement with low-level WritableFileWriter::Append() measurement for BLOB_DB_BLOB_FILE_WRITE_MICROS. This is to make FILE_WRITE_{FLUSH|COMPACTION|DB_OPEN}_MICROS include blob file. As a consequence, this introduces some behavioral changes on it, see HISTORY and db bench test plan below for more info.
- Fix bugs where BLOB_DB_BLOB_FILE_SYNCED/BLOB_DB_BLOB_FILE_BYTES_WRITTEN include file failed to sync and bytes failed to write.
- Refactor WriteOptions constructor for easier construction with io_activity and rate_limiter_priority
- Refactor DBImpl::~DBImpl()/BlobDBImpl::Close() to bypass thread op verification
- Build table
- TableBuilderOptions now includes Read/WriteOpitons so BuildTable() do not need to take these two variables
- Replace the io_priority passed into BuildTable() with TableBuilderOptions::WriteOpitons::rate_limiter_priority. Similar for BlobFileBuilder.
This parameter is used for dynamically changing file io priority for flush, see https://github.com/facebook/rocksdb/pull/9988?fbclid=IwAR1DtKel6c-bRJAdesGo0jsbztRtciByNlvokbxkV6h_L-AE9MACzqRTT5s for more
- Update ThreadStatus::FLUSH_BYTES_WRITTEN to use io_activity to track flush IO in flush job and db open instead of io_priority
## Test
### db bench
Flush
```
./db_bench --statistics=1 --benchmarks=fillseq --num=100000 --write_buffer_size=100
rocksdb.sst.write.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.flush.micros P50 : 1.830863 P95 : 4.094720 P99 : 6.578947 P100 : 26.000000 COUNT : 7875 SUM : 20377
rocksdb.file.write.compaction.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.db.open.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
```
compaction, db oopen
```
Setup: ./db_bench --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
rocksdb.sst.write.micros P50 : 2.675325 P95 : 9.578788 P99 : 18.780000 P100 : 314.000000 COUNT : 638 SUM : 3279
rocksdb.file.write.flush.micros P50 : 0.000000 P95 : 0.000000 P99 : 0.000000 P100 : 0.000000 COUNT : 0 SUM : 0
rocksdb.file.write.compaction.micros P50 : 2.757353 P95 : 9.610687 P99 : 19.316667 P100 : 314.000000 COUNT : 615 SUM : 3213
rocksdb.file.write.db.open.micros P50 : 2.055556 P95 : 3.925000 P99 : 9.000000 P100 : 9.000000 COUNT : 23 SUM : 66
```
blob stats - just to make sure they aren't broken by this PR
```
Integrated Blob DB
Setup: ./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
Run:./db_bench --enable_blob_files=1 --statistics=1 --benchmarks=compact --db=../db_bench --use_existing_db=1
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 7.298246 P95 : 9.771930 P99 : 9.991813 P100 : 16.000000 COUNT : 235 SUM : 1600
rocksdb.blobdb.blob.file.synced COUNT : 1
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 2.000000 P95 : 2.829360 P99 : 2.993779 P100 : 9.000000 COUNT : 707 SUM : 1614
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 1 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 34842 (stay the same)
```
```
Stacked Blob DB
Run: ./db_bench --use_blob_db=1 --statistics=1 --benchmarks=fillseq --num=10000 --disable_auto_compactions=1 -write_buffer_size=100 --db=../db_bench
pre-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 12.808042 P95 : 19.674497 P99 : 28.539683 P100 : 51.000000 COUNT : 10000 SUM : 140876
rocksdb.blobdb.blob.file.synced COUNT : 8
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445
post-PR:
rocksdb.blobdb.blob.file.write.micros P50 : 1.657370 P95 : 2.952175 P99 : 3.877519 P100 : 24.000000 COUNT : 30001 SUM : 67924
- COUNT is higher and values are smaller as it includes header and footer write
- COUNT is 3X higher due to each Append() count as one post-PR, while in pre-PR, 3 Append()s counts as one. See https://github.com/facebook/rocksdb/pull/11910/files#diff-32b811c0a1c000768cfb2532052b44dc0b3bf82253f3eab078e15ff201a0dabfL157-L164
rocksdb.blobdb.blob.file.synced COUNT : 8 (stay the same)
rocksdb.blobdb.blob.file.bytes.written COUNT : 1043445 (stay the same)
```
### Rehearsal CI stress test
Trigger 3 full runs of all our CI stress tests
### Performance
Flush
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=ManualFlush/key_num:524288/per_key_size:256 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark; enable_statistics = true
Pre-pr: avg 507515519.3 ns
497686074,499444327,500862543,501389862,502994471,503744435,504142123,504224056,505724198,506610393,506837742,506955122,507695561,507929036,508307733,508312691,508999120,509963561,510142147,510698091,510743096,510769317,510957074,511053311,511371367,511409911,511432960,511642385,511691964,511730908,
Post-pr: avg 511971266.5 ns, regressed 0.88%
502744835,506502498,507735420,507929724,508313335,509548582,509994942,510107257,510715603,511046955,511352639,511458478,512117521,512317380,512766303,512972652,513059586,513804934,513808980,514059409,514187369,514389494,514447762,514616464,514622882,514641763,514666265,514716377,514990179,515502408,
```
Compaction
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{pre|post}_pr --benchmark_filter=ManualCompaction/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 495346098.30 ns
492118301,493203526,494201411,494336607,495269217,495404950,496402598,497012157,497358370,498153846
Post-pr: avg 504528077.20, regressed 1.85%. "ManualCompaction" include flush so the isolated regression for compaction should be around 1.85-0.88 = 0.97%
502465338,502485945,502541789,502909283,503438601,504143885,506113087,506629423,507160414,507393007
```
Put with WAL (in case passing WriteOptions slows down this path even without collecting SST write stats)
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_pre_pr --benchmark_filter=DBPut/comp_style:0/max_data:107374182400/per_key_size:256/enable_statistics:1/wal:1 --benchmark_repetitions=1000
-- default: 1 thread is used to run benchmark
Pre-pr: avg 3848.10 ns
3814,3838,3839,3848,3854,3854,3854,3860,3860,3860
Post-pr: avg 3874.20 ns, regressed 0.68%
3863,3867,3871,3874,3875,3877,3877,3877,3880,3881
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11910
Reviewed By: ajkr
Differential Revision: D49788060
Pulled By: hx235
fbshipit-source-id: 79e73699cda5be3b66461687e5147c2484fc5eff
2023-12-29 23:29:23 +00:00
|
|
|
Status s = log_writer_->AddRecord(WriteOptions(), record);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::shared_ptr<InternalKeyComparator> internal_comparator_;
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families_;
|
|
|
|
SequenceNumber last_seqno_;
|
|
|
|
std::unique_ptr<log::Writer> log_writer_;
|
|
|
|
};
|
|
|
|
|
|
|
|
TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
|
|
|
|
std::vector<SstInfo> existing_files = {
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
SstInfo(100, kDefaultColumnFamilyName, "a", 100 /* epoch_number */),
|
|
|
|
SstInfo(102, kDefaultColumnFamilyName, "b", 102 /* epoch_number */),
|
|
|
|
SstInfo(103, kDefaultColumnFamilyName, "c", 103 /* epoch_number */),
|
|
|
|
SstInfo(107, kDefaultColumnFamilyName, "d", 107 /* epoch_number */),
|
|
|
|
SstInfo(110, kDefaultColumnFamilyName, "e", 110 /* epoch_number */)};
|
2020-03-21 02:17:54 +00:00
|
|
|
std::vector<FileMetaData> file_metas;
|
|
|
|
CreateDummyTableFiles(existing_files, &file_metas);
|
|
|
|
|
|
|
|
PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
|
|
|
|
std::vector<std::pair<int, FileMetaData>> added_files;
|
|
|
|
for (uint64_t file_num = 10; file_num < 15; ++file_num) {
|
|
|
|
std::string smallest_ukey = "a";
|
|
|
|
std::string largest_ukey = "b";
|
|
|
|
InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
|
|
|
|
InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
|
2021-12-03 22:42:05 +00:00
|
|
|
FileMetaData meta = FileMetaData(
|
|
|
|
file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
|
|
|
|
largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
file_num /* epoch_number */, kUnknownFileChecksum,
|
2023-06-22 04:49:01 +00:00
|
|
|
kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
|
|
|
|
/* user_defined_timestamps_persisted */ true);
|
2020-03-21 02:17:54 +00:00
|
|
|
added_files.emplace_back(0, meta);
|
|
|
|
}
|
|
|
|
WriteFileAdditionAndDeletionToManifest(
|
|
|
|
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
|
|
|
|
std::vector<std::pair<int, uint64_t>> deleted_files;
|
|
|
|
deleted_files.emplace_back(0, 10);
|
|
|
|
WriteFileAdditionAndDeletionToManifest(
|
|
|
|
/*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
|
|
|
|
log_writer_.reset();
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2020-03-21 02:17:54 +00:00
|
|
|
std::string manifest_path;
|
|
|
|
VerifyManifest(&manifest_path);
|
|
|
|
std::string db_id;
|
|
|
|
bool has_missing_table_file = false;
|
2024-08-24 02:49:25 +00:00
|
|
|
Status s = versions_->TryRecoverFromOneManifest(
|
|
|
|
manifest_path, column_families_,
|
|
|
|
/*read_only=*/false, &db_id, &has_missing_table_file);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(has_missing_table_file);
|
|
|
|
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
|
|
|
|
VersionStorageInfo* vstorage = cfd->current()->storage_info();
|
|
|
|
const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
|
|
|
|
ASSERT_TRUE(files.empty());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
|
|
|
|
std::vector<SstInfo> existing_files = {
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */,
|
|
|
|
100 /* epoch_number */),
|
|
|
|
SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */,
|
|
|
|
102 /* epoch_number */),
|
|
|
|
SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */,
|
|
|
|
103 /* epoch_number */),
|
|
|
|
SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */,
|
|
|
|
107 /* epoch_number */),
|
|
|
|
SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */,
|
|
|
|
110 /* epoch_number */)};
|
2020-03-21 02:17:54 +00:00
|
|
|
std::vector<FileMetaData> file_metas;
|
|
|
|
CreateDummyTableFiles(existing_files, &file_metas);
|
|
|
|
|
|
|
|
PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
|
|
|
|
std::vector<std::pair<int, FileMetaData>> added_files;
|
|
|
|
for (size_t i = 3; i != 5; ++i) {
|
|
|
|
added_files.emplace_back(0, file_metas[i]);
|
|
|
|
}
|
|
|
|
WriteFileAdditionAndDeletionToManifest(
|
|
|
|
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
|
|
|
|
|
|
|
|
added_files.clear();
|
|
|
|
for (uint64_t file_num = 120; file_num < 130; ++file_num) {
|
|
|
|
std::string smallest_ukey = "a";
|
|
|
|
std::string largest_ukey = "b";
|
|
|
|
InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
|
|
|
|
InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
|
2021-12-03 22:42:05 +00:00
|
|
|
FileMetaData meta = FileMetaData(
|
|
|
|
file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
|
|
|
|
largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
file_num /* epoch_number */, kUnknownFileChecksum,
|
2023-06-22 04:49:01 +00:00
|
|
|
kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
|
|
|
|
/* user_defined_timestamps_persisted */ true);
|
2020-03-21 02:17:54 +00:00
|
|
|
added_files.emplace_back(0, meta);
|
|
|
|
}
|
|
|
|
WriteFileAdditionAndDeletionToManifest(
|
|
|
|
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
|
|
|
|
log_writer_.reset();
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2020-03-21 02:17:54 +00:00
|
|
|
std::string manifest_path;
|
|
|
|
VerifyManifest(&manifest_path);
|
|
|
|
std::string db_id;
|
|
|
|
bool has_missing_table_file = false;
|
2024-08-24 02:49:25 +00:00
|
|
|
Status s = versions_->TryRecoverFromOneManifest(
|
|
|
|
manifest_path, column_families_,
|
|
|
|
/*read_only=*/false, &db_id, &has_missing_table_file);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(has_missing_table_file);
|
|
|
|
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
|
|
|
|
VersionStorageInfo* vstorage = cfd->current()->storage_info();
|
|
|
|
const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
|
|
|
|
if (cfd->GetName() == kDefaultColumnFamilyName) {
|
|
|
|
ASSERT_EQ(2, files.size());
|
|
|
|
for (const auto* fmeta : files) {
|
|
|
|
if (fmeta->fd.GetNumber() != 107 && fmeta->fd.GetNumber() != 110) {
|
|
|
|
ASSERT_FALSE(true);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(files.empty());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
|
|
|
|
std::vector<SstInfo> existing_files = {
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */,
|
|
|
|
100 /* epoch_number */),
|
|
|
|
SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */,
|
|
|
|
102 /* epoch_number */),
|
|
|
|
SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */,
|
|
|
|
103 /* epoch_number */),
|
|
|
|
SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */,
|
|
|
|
107 /* epoch_number */),
|
|
|
|
SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */,
|
|
|
|
110 /* epoch_number */)};
|
2020-03-21 02:17:54 +00:00
|
|
|
std::vector<FileMetaData> file_metas;
|
|
|
|
CreateDummyTableFiles(existing_files, &file_metas);
|
|
|
|
|
|
|
|
PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
|
|
|
|
std::vector<std::pair<int, FileMetaData>> added_files;
|
|
|
|
for (const auto& meta : file_metas) {
|
|
|
|
added_files.emplace_back(0, meta);
|
|
|
|
}
|
|
|
|
WriteFileAdditionAndDeletionToManifest(
|
|
|
|
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
|
|
|
|
std::vector<std::pair<int, uint64_t>> deleted_files;
|
|
|
|
deleted_files.emplace_back(/*level=*/0, 100);
|
|
|
|
WriteFileAdditionAndDeletionToManifest(
|
|
|
|
/*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
|
|
|
|
log_writer_.reset();
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2020-03-21 02:17:54 +00:00
|
|
|
std::string manifest_path;
|
|
|
|
VerifyManifest(&manifest_path);
|
|
|
|
std::string db_id;
|
|
|
|
bool has_missing_table_file = false;
|
2024-08-24 02:49:25 +00:00
|
|
|
Status s = versions_->TryRecoverFromOneManifest(
|
|
|
|
manifest_path, column_families_,
|
|
|
|
/*read_only=*/false, &db_id, &has_missing_table_file);
|
2020-03-21 02:17:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_FALSE(has_missing_table_file);
|
|
|
|
for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
|
|
|
|
VersionStorageInfo* vstorage = cfd->current()->storage_info();
|
|
|
|
const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
|
|
|
|
if (cfd->GetName() == kDefaultColumnFamilyName) {
|
|
|
|
ASSERT_EQ(existing_files.size() - deleted_files.size(), files.size());
|
|
|
|
bool has_deleted_file = false;
|
|
|
|
for (const auto* fmeta : files) {
|
|
|
|
if (fmeta->fd.GetNumber() == 100) {
|
|
|
|
has_deleted_file = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ASSERT_FALSE(has_deleted_file);
|
|
|
|
} else {
|
|
|
|
ASSERT_TRUE(files.empty());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-10 03:05:14 +00:00
|
|
|
TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
|
Fix a race condition in WAL tracking causing DB open failure (#9715)
Summary:
There is a race condition if WAL tracking in the MANIFEST is enabled in a database that disables 2PC.
The race condition is between two background flush threads trying to install flush results to the MANIFEST.
Consider an example database with two column families: "default" (cfd0) and "cf1" (cfd1). Initially,
both column families have one mutable (active) memtable whose data backed by 6.log.
1. Trigger a manual flush for "cf1", creating a 7.log
2. Insert another key to "default", and trigger flush for "default", creating 8.log
3. BgFlushThread1 finishes writing 9.sst
4. BgFlushThread2 finishes writing 10.sst
```
Time BgFlushThread1 BgFlushThread2
| mutex_.Lock()
| precompute min_wal_to_keep as 6
| mutex_.Unlock()
| mutex_.Lock()
| precompute min_wal_to_keep as 6
| join MANIFEST write queue and mutex_.Unlock()
| write to MANIFEST
| mutex_.Lock()
| cfd1->log_number = 7
| Signal bg_flush_2 and mutex_.Unlock()
| wake up and mutex_.Lock()
| cfd0->log_number = 8
| FindObsoleteFiles() with job_context->log_number == 7
| mutex_.Unlock()
| PurgeObsoleteFiles() deletes 6.log
V
```
As shown in the above, BgFlushThread2 thinks that the min wal to keep is 6.log because "cf1" has unflushed data in 6.log (cf1.log_number=6).
Similarly, BgThread1 thinks that min wal to keep is also 6.log because "default" has unflushed data (default.log_number=6).
No WAL deletion will be written to MANIFEST because 6 is equal to `versions_->wals_.min_wal_number_to_keep`,
due to https://github.com/facebook/rocksdb/blob/7.1.fb/db/memtable_list.cc#L513:L514.
The bg flush thread that finishes last will perform file purging. `job_context.log_number` will be evaluated as 7, i.e.
the min wal that contains unflushed data, causing 6.log to be deleted. However, MANIFEST thinks 6.log should still exist.
If you close the db at this point, you won't be able to re-open it if `track_and_verify_wal_in_manifest` is true.
We must handle the case of multiple bg flush threads, and it is difficult for one bg flush thread to know
the correct min wal number until the other bg flush threads have finished committing to the manifest and updated
the `cfd::log_number`.
To fix this issue, we rename an existing variable `min_log_number_to_keep_2pc` to `min_log_number_to_keep`,
and use it to track WAL file deletion in non-2pc mode as well.
This variable is updated only 1) during recovery with mutex held, or 2) in the MANIFEST write thread.
`min_log_number_to_keep` means RocksDB will delete WALs below it, although there may be WALs
above it which are also obsolete. Formally, we will have [min_wal_to_keep, max_obsolete_wal]. During recovery, we
make sure that only WALs above max_obsolete_wal are checked and added back to `alive_log_files_`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9715
Test Plan:
```
make check
```
Also ran stress test below (with asan) to make sure it completes successfully.
```
TEST_TMPDIR=/dev/shm/rocksdb OPT=-g ASAN_OPTIONS=disable_coredump=0 \
CRASH_TEST_EXT_ARGS=--compression_type=zstd SKIP_FORMAT_BUCK_CHECKS=1 \
make J=52 -j52 blackbox_asan_crash_test
```
Reviewed By: ltamasi
Differential Revision: D34984412
Pulled By: riversand963
fbshipit-source-id: c7b21a8d84751bb55ea79c9f387103d21b231005
2022-03-24 02:41:31 +00:00
|
|
|
db_options_.allow_2pc = true;
|
2020-12-10 03:05:14 +00:00
|
|
|
NewDB();
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2022-12-13 21:29:37 +00:00
|
|
|
SstInfo sst(100, kDefaultColumnFamilyName, "a", 0 /* level */,
|
|
|
|
100 /* epoch_number */);
|
2020-12-10 03:05:14 +00:00
|
|
|
std::vector<FileMetaData> file_metas;
|
|
|
|
CreateDummyTableFiles({sst}, &file_metas);
|
|
|
|
|
|
|
|
constexpr WalNumber kMinWalNumberToKeep2PC = 10;
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.AddFile(0, file_metas[0]);
|
|
|
|
edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC);
|
|
|
|
ASSERT_OK(LogAndApplyToDefaultCF(edit));
|
Fix a race condition in WAL tracking causing DB open failure (#9715)
Summary:
There is a race condition if WAL tracking in the MANIFEST is enabled in a database that disables 2PC.
The race condition is between two background flush threads trying to install flush results to the MANIFEST.
Consider an example database with two column families: "default" (cfd0) and "cf1" (cfd1). Initially,
both column families have one mutable (active) memtable whose data backed by 6.log.
1. Trigger a manual flush for "cf1", creating a 7.log
2. Insert another key to "default", and trigger flush for "default", creating 8.log
3. BgFlushThread1 finishes writing 9.sst
4. BgFlushThread2 finishes writing 10.sst
```
Time BgFlushThread1 BgFlushThread2
| mutex_.Lock()
| precompute min_wal_to_keep as 6
| mutex_.Unlock()
| mutex_.Lock()
| precompute min_wal_to_keep as 6
| join MANIFEST write queue and mutex_.Unlock()
| write to MANIFEST
| mutex_.Lock()
| cfd1->log_number = 7
| Signal bg_flush_2 and mutex_.Unlock()
| wake up and mutex_.Lock()
| cfd0->log_number = 8
| FindObsoleteFiles() with job_context->log_number == 7
| mutex_.Unlock()
| PurgeObsoleteFiles() deletes 6.log
V
```
As shown in the above, BgFlushThread2 thinks that the min wal to keep is 6.log because "cf1" has unflushed data in 6.log (cf1.log_number=6).
Similarly, BgThread1 thinks that min wal to keep is also 6.log because "default" has unflushed data (default.log_number=6).
No WAL deletion will be written to MANIFEST because 6 is equal to `versions_->wals_.min_wal_number_to_keep`,
due to https://github.com/facebook/rocksdb/blob/7.1.fb/db/memtable_list.cc#L513:L514.
The bg flush thread that finishes last will perform file purging. `job_context.log_number` will be evaluated as 7, i.e.
the min wal that contains unflushed data, causing 6.log to be deleted. However, MANIFEST thinks 6.log should still exist.
If you close the db at this point, you won't be able to re-open it if `track_and_verify_wal_in_manifest` is true.
We must handle the case of multiple bg flush threads, and it is difficult for one bg flush thread to know
the correct min wal number until the other bg flush threads have finished committing to the manifest and updated
the `cfd::log_number`.
To fix this issue, we rename an existing variable `min_log_number_to_keep_2pc` to `min_log_number_to_keep`,
and use it to track WAL file deletion in non-2pc mode as well.
This variable is updated only 1) during recovery with mutex held, or 2) in the MANIFEST write thread.
`min_log_number_to_keep` means RocksDB will delete WALs below it, although there may be WALs
above it which are also obsolete. Formally, we will have [min_wal_to_keep, max_obsolete_wal]. During recovery, we
make sure that only WALs above max_obsolete_wal are checked and added back to `alive_log_files_`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9715
Test Plan:
```
make check
```
Also ran stress test below (with asan) to make sure it completes successfully.
```
TEST_TMPDIR=/dev/shm/rocksdb OPT=-g ASAN_OPTIONS=disable_coredump=0 \
CRASH_TEST_EXT_ARGS=--compression_type=zstd SKIP_FORMAT_BUCK_CHECKS=1 \
make J=52 -j52 blackbox_asan_crash_test
```
Reviewed By: ltamasi
Differential Revision: D34984412
Pulled By: riversand963
fbshipit-source-id: c7b21a8d84751bb55ea79c9f387103d21b231005
2022-03-24 02:41:31 +00:00
|
|
|
ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
|
2020-12-10 03:05:14 +00:00
|
|
|
|
|
|
|
for (int i = 0; i < 3; i++) {
|
|
|
|
CreateNewManifest();
|
|
|
|
ReopenDB();
|
Fix a race condition in WAL tracking causing DB open failure (#9715)
Summary:
There is a race condition if WAL tracking in the MANIFEST is enabled in a database that disables 2PC.
The race condition is between two background flush threads trying to install flush results to the MANIFEST.
Consider an example database with two column families: "default" (cfd0) and "cf1" (cfd1). Initially,
both column families have one mutable (active) memtable whose data backed by 6.log.
1. Trigger a manual flush for "cf1", creating a 7.log
2. Insert another key to "default", and trigger flush for "default", creating 8.log
3. BgFlushThread1 finishes writing 9.sst
4. BgFlushThread2 finishes writing 10.sst
```
Time BgFlushThread1 BgFlushThread2
| mutex_.Lock()
| precompute min_wal_to_keep as 6
| mutex_.Unlock()
| mutex_.Lock()
| precompute min_wal_to_keep as 6
| join MANIFEST write queue and mutex_.Unlock()
| write to MANIFEST
| mutex_.Lock()
| cfd1->log_number = 7
| Signal bg_flush_2 and mutex_.Unlock()
| wake up and mutex_.Lock()
| cfd0->log_number = 8
| FindObsoleteFiles() with job_context->log_number == 7
| mutex_.Unlock()
| PurgeObsoleteFiles() deletes 6.log
V
```
As shown in the above, BgFlushThread2 thinks that the min wal to keep is 6.log because "cf1" has unflushed data in 6.log (cf1.log_number=6).
Similarly, BgThread1 thinks that min wal to keep is also 6.log because "default" has unflushed data (default.log_number=6).
No WAL deletion will be written to MANIFEST because 6 is equal to `versions_->wals_.min_wal_number_to_keep`,
due to https://github.com/facebook/rocksdb/blob/7.1.fb/db/memtable_list.cc#L513:L514.
The bg flush thread that finishes last will perform file purging. `job_context.log_number` will be evaluated as 7, i.e.
the min wal that contains unflushed data, causing 6.log to be deleted. However, MANIFEST thinks 6.log should still exist.
If you close the db at this point, you won't be able to re-open it if `track_and_verify_wal_in_manifest` is true.
We must handle the case of multiple bg flush threads, and it is difficult for one bg flush thread to know
the correct min wal number until the other bg flush threads have finished committing to the manifest and updated
the `cfd::log_number`.
To fix this issue, we rename an existing variable `min_log_number_to_keep_2pc` to `min_log_number_to_keep`,
and use it to track WAL file deletion in non-2pc mode as well.
This variable is updated only 1) during recovery with mutex held, or 2) in the MANIFEST write thread.
`min_log_number_to_keep` means RocksDB will delete WALs below it, although there may be WALs
above it which are also obsolete. Formally, we will have [min_wal_to_keep, max_obsolete_wal]. During recovery, we
make sure that only WALs above max_obsolete_wal are checked and added back to `alive_log_files_`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9715
Test Plan:
```
make check
```
Also ran stress test below (with asan) to make sure it completes successfully.
```
TEST_TMPDIR=/dev/shm/rocksdb OPT=-g ASAN_OPTIONS=disable_coredump=0 \
CRASH_TEST_EXT_ARGS=--compression_type=zstd SKIP_FORMAT_BUCK_CHECKS=1 \
make J=52 -j52 blackbox_asan_crash_test
```
Reviewed By: ltamasi
Differential Revision: D34984412
Pulled By: riversand963
fbshipit-source-id: c7b21a8d84751bb55ea79c9f387103d21b231005
2022-03-24 02:41:31 +00:00
|
|
|
ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
|
2020-12-10 03:05:14 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-08-17 00:18:54 +00:00
|
|
|
class BestEffortsRecoverIncompleteVersionTest
|
|
|
|
: public VersionSetTestMissingFiles {
|
|
|
|
public:
|
|
|
|
BestEffortsRecoverIncompleteVersionTest()
|
|
|
|
: VersionSetTestMissingFiles("best_efforts_recover_incomplete_version") {}
|
|
|
|
|
|
|
|
struct BlobInfo {
|
|
|
|
uint64_t file_number;
|
|
|
|
bool file_missing;
|
|
|
|
std::string key;
|
|
|
|
std::string blob;
|
|
|
|
BlobInfo(uint64_t _file_number, bool _file_missing, std::string _key,
|
|
|
|
std::string _blob)
|
|
|
|
: file_number(_file_number),
|
|
|
|
file_missing(_file_missing),
|
|
|
|
key(_key),
|
|
|
|
blob(_blob) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
void CreateDummyBlobFiles(const std::vector<BlobInfo>& infos,
|
|
|
|
std::vector<BlobFileAddition>* blob_metas) {
|
|
|
|
for (const auto& info : infos) {
|
|
|
|
if (!info.file_missing) {
|
|
|
|
WriteDummyBlobFile(info.file_number, info.key, info.blob);
|
|
|
|
}
|
|
|
|
blob_metas->emplace_back(
|
|
|
|
info.file_number, 1 /*total_blob_count*/,
|
|
|
|
info.key.size() + info.blob.size() /*total_blob_bytes*/,
|
|
|
|
"" /*checksum_method*/, "" /*check_sum_value*/);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Creates a test blob file that is valid so it can pass the
|
|
|
|
// `VersionEditHandlerPointInTime::VerifyBlobFile` check.
|
|
|
|
void WriteDummyBlobFile(uint64_t blob_file_number, const Slice& key,
|
|
|
|
const Slice& blob) {
|
|
|
|
ImmutableOptions options;
|
|
|
|
std::string blob_file_path = BlobFileName(dbname_, blob_file_number);
|
|
|
|
|
|
|
|
std::unique_ptr<FSWritableFile> file;
|
|
|
|
ASSERT_OK(
|
|
|
|
fs_->NewWritableFile(blob_file_path, FileOptions(), &file, nullptr));
|
|
|
|
|
|
|
|
std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
|
|
|
|
std::move(file), blob_file_path, FileOptions(), options.clock));
|
|
|
|
|
|
|
|
BlobLogWriter blob_log_writer(std::move(file_writer), options.clock,
|
|
|
|
/*statistics*/ nullptr, blob_file_number,
|
|
|
|
/*use_fsync*/ true,
|
|
|
|
/*do_flush*/ false);
|
|
|
|
|
|
|
|
constexpr ExpirationRange expiration_range;
|
|
|
|
BlobLogHeader header(/*column_family_id*/ 0, kNoCompression,
|
|
|
|
/*has_ttl*/ false, expiration_range);
|
|
|
|
ASSERT_OK(blob_log_writer.WriteHeader(WriteOptions(), header));
|
|
|
|
std::string compressed_blob;
|
|
|
|
uint64_t key_offset = 0;
|
|
|
|
uint64_t blob_offset = 0;
|
|
|
|
ASSERT_OK(blob_log_writer.AddRecord(WriteOptions(), key, blob, &key_offset,
|
|
|
|
&blob_offset));
|
|
|
|
BlobLogFooter footer;
|
|
|
|
footer.blob_count = 1;
|
|
|
|
footer.expiration_range = expiration_range;
|
|
|
|
std::string checksum_method;
|
|
|
|
std::string checksum_value;
|
|
|
|
ASSERT_OK(blob_log_writer.AppendFooter(WriteOptions(), footer,
|
|
|
|
&checksum_method, &checksum_value));
|
|
|
|
}
|
|
|
|
|
|
|
|
void RecoverFromManifestWithMissingFiles(
|
|
|
|
const std::vector<std::pair<int, FileMetaData>>& added_files,
|
|
|
|
const std::vector<BlobFileAddition>& blob_files) {
|
|
|
|
PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
|
|
|
|
WriteFileAdditionAndDeletionToManifest(
|
|
|
|
/*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>(),
|
|
|
|
blob_files);
|
|
|
|
log_writer_.reset();
|
2024-08-24 02:49:25 +00:00
|
|
|
CreateCurrentFile();
|
2024-08-17 00:18:54 +00:00
|
|
|
std::string manifest_path;
|
|
|
|
VerifyManifest(&manifest_path);
|
|
|
|
std::string db_id;
|
|
|
|
bool has_missing_table_file = false;
|
2024-08-24 02:49:25 +00:00
|
|
|
Status s = versions_->TryRecoverFromOneManifest(
|
|
|
|
manifest_path, column_families_,
|
|
|
|
/*read_only=*/false, &db_id, &has_missing_table_file);
|
2024-08-17 00:18:54 +00:00
|
|
|
ASSERT_OK(s);
|
|
|
|
ASSERT_TRUE(has_missing_table_file);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
// The only absent table file (100) lives on level 1. After recovery the
// version set is expected to report no live table files at all.
TEST_F(BestEffortsRecoverIncompleteVersionTest, NonL0MissingFiles) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
              100 /* epoch_number */, true /* file_missing */),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, false /* file_missing */),
      SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
              102 /* epoch_number */, false /* file_missing */),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(ssts, &file_metas);

  // Pair each file's metadata with the level recorded in its SstInfo.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  for (size_t idx = 0; idx != ssts.size(); ++idx) {
    files_to_add.emplace_back(ssts[idx].level, file_metas[idx]);
  }

  const std::vector<BlobFileAddition> no_blob_files;
  RecoverFromManifestWithMissingFiles(files_to_add, no_blob_files);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_TRUE(live_tables.empty());
}
|
|
|
|
|
|
|
|
// The absent table file (101) is an L0 file with a lower epoch number than
// another, present L0 file (102). After recovery the version set is expected
// to report no live table files.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingNonSuffixL0Files) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
              100 /* epoch_number */, false /* file_missing */),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, true /* file_missing */),
      SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
              102 /* epoch_number */, false /* file_missing */),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(ssts, &file_metas);

  // Pair each file's metadata with the level recorded in its SstInfo.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  for (size_t idx = 0; idx != ssts.size(); ++idx) {
    files_to_add.emplace_back(ssts[idx].level, file_metas[idx]);
  }

  const std::vector<BlobFileAddition> no_blob_files;
  RecoverFromManifestWithMissingFiles(files_to_add, no_blob_files);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_TRUE(live_tables.empty());
}
|
|
|
|
|
|
|
|
// Both L0 table files exist on disk, but the blob files they reference
// (102 and 103) do not. After recovery the version set is expected to report
// no live table files.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingBlobFiles) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */,
              100 /* epoch_number */, false /* file_missing */,
              102 /*oldest_blob_file_number*/),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, false /* file_missing */,
              103 /*oldest_blob_file_number*/),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(ssts, &file_metas);

  std::vector<BlobInfo> blobs = {
      BlobInfo(102, true /*file_missing*/, "a", "blob1"),
      BlobInfo(103, true /*file_missing*/, "a", "blob2"),
  };
  std::vector<BlobFileAddition> blob_additions;
  CreateDummyBlobFiles(blobs, &blob_additions);

  // Pair each file's metadata with the level recorded in its SstInfo.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  for (size_t idx = 0; idx != ssts.size(); ++idx) {
    files_to_add.emplace_back(ssts[idx].level, file_metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(files_to_add, blob_additions);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_TRUE(live_tables.empty());
}
|
|
|
|
|
|
|
|
// Only the L0 file with the highest epoch number (102) is absent. Recovery is
// expected to keep the two remaining table files: one on L0 and one on L1.
TEST_F(BestEffortsRecoverIncompleteVersionTest, MissingL0SuffixOnly) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
              100 /* epoch_number */, false /* file_missing */),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, false /* file_missing */),
      SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
              102 /* epoch_number */, true /* file_missing */),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(ssts, &file_metas);

  // Pair each file's metadata with the level recorded in its SstInfo.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  for (size_t idx = 0; idx != ssts.size(); ++idx) {
    files_to_add.emplace_back(ssts[idx].level, file_metas[idx]);
  }

  const std::vector<BlobFileAddition> no_blob_files;
  RecoverFromManifestWithMissingFiles(files_to_add, no_blob_files);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_EQ(2, live_tables.size());

  // Inspect the recovered LSM shape of the default column family.
  VersionStorageInfo* const storage =
      versions_->GetColumnFamilySet()->GetDefault()->current()->storage_info();
  ASSERT_EQ(1, storage->LevelFiles(0).size());
  ASSERT_EQ(1, storage->LevelFiles(1).size());
}
|
|
|
|
|
|
|
|
// The L0 file with the highest epoch number (102) and the blob file it
// references (104) are both absent. Recovery is expected to keep the other
// two table files (one on L0, one on L1) and the one present blob file.
TEST_F(BestEffortsRecoverIncompleteVersionTest,
       MissingL0SuffixAndTheirBlobFiles) {
  std::vector<SstInfo> ssts = {
      SstInfo(100, kDefaultColumnFamilyName, "a", 1 /* level */,
              100 /* epoch_number */, false /* file_missing */),
      SstInfo(101, kDefaultColumnFamilyName, "a", 0 /* level */,
              101 /* epoch_number */, false /* file_missing */,
              103 /*oldest_blob_file_number*/),
      SstInfo(102, kDefaultColumnFamilyName, "a", 0 /* level */,
              102 /* epoch_number */, true /* file_missing */,
              104 /*oldest_blob_file_number*/),
  };
  std::vector<FileMetaData> file_metas;
  CreateDummyTableFiles(ssts, &file_metas);

  std::vector<BlobInfo> blobs = {
      BlobInfo(103, false /*file_missing*/, "a", "blob1"),
      BlobInfo(104, true /*file_missing*/, "a", "blob2"),
  };
  std::vector<BlobFileAddition> blob_additions;
  CreateDummyBlobFiles(blobs, &blob_additions);

  // Pair each file's metadata with the level recorded in its SstInfo.
  std::vector<std::pair<int, FileMetaData>> files_to_add;
  for (size_t idx = 0; idx != ssts.size(); ++idx) {
    files_to_add.emplace_back(ssts[idx].level, file_metas[idx]);
  }
  RecoverFromManifestWithMissingFiles(files_to_add, blob_additions);

  std::vector<uint64_t> live_tables;
  std::vector<uint64_t> live_blobs;
  versions_->AddLiveFiles(&live_tables, &live_blobs);
  ASSERT_EQ(2, live_tables.size());
  ASSERT_EQ(1, live_blobs.size());

  // Inspect the recovered LSM shape of the default column family.
  VersionStorageInfo* const storage =
      versions_->GetColumnFamilySet()->GetDefault()->current()->storage_info();
  ASSERT_EQ(1, storage->LevelFiles(0).size());
  ASSERT_EQ(1, storage->LevelFiles(1).size());
  ASSERT_EQ(1, storage->GetBlobFiles().size());
}
|
|
|
|
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocating `FileMetaData` for a [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR charges that memory against our global memory limit, which is based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
class ChargeFileMetadataTest : public DBTestBase {
|
|
|
|
public:
|
|
|
|
ChargeFileMetadataTest()
|
|
|
|
: DBTestBase("charge_file_metadata_test", /*env_do_fsync=*/true) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
class ChargeFileMetadataTestWithParam
|
|
|
|
: public ChargeFileMetadataTest,
|
|
|
|
public testing::WithParamInterface<CacheEntryRoleOptions::Decision> {
|
|
|
|
public:
|
2023-12-04 19:17:32 +00:00
|
|
|
ChargeFileMetadataTestWithParam() = default;
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
// Instantiate every ChargeFileMetadataTestWithParam test twice: once with
// the decision kEnabled and once with kDisabled.
INSTANTIATE_TEST_CASE_P(
    ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam,
    ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled,
                      CacheEntryRoleOptions::Decision::kDisabled));
|
|
|
|
|
|
|
|
TEST_P(ChargeFileMetadataTestWithParam, Basic) {
|
|
|
|
Options options;
|
2023-06-16 04:12:39 +00:00
|
|
|
options.level_compaction_dynamic_level_bytes = false;
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
BlockBasedTableOptions table_options;
|
|
|
|
CacheEntryRoleOptions::Decision charge_file_metadata = GetParam();
|
|
|
|
table_options.cache_usage_options.options_overrides.insert(
|
|
|
|
{CacheEntryRole::kFileMetadata, {/*.charged = */ charge_file_metadata}});
|
|
|
|
std::shared_ptr<TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>
|
|
|
|
file_metadata_charge_only_cache = std::make_shared<
|
|
|
|
TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
|
|
|
|
NewLRUCache(
|
|
|
|
4 * CacheReservationManagerImpl<
|
|
|
|
CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
|
|
|
|
0 /* num_shard_bits */, true /* strict_capacity_limit */));
|
|
|
|
table_options.block_cache = file_metadata_charge_only_cache;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
options.create_if_missing = true;
|
|
|
|
options.disable_auto_compactions = true;
|
|
|
|
DestroyAndReopen(options);
|
|
|
|
|
|
|
|
// Create 128 file metadata, each of which is roughly 1024 bytes.
|
|
|
|
// This results in 1 *
|
|
|
|
// CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
|
|
|
|
// cache reservation for file metadata.
|
|
|
|
for (int i = 1; i <= 128; ++i) {
|
|
|
|
ASSERT_OK(Put(std::string(1024, 'a'), "va"));
|
|
|
|
ASSERT_OK(Put("b", "vb"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
|
|
|
|
1 * CacheReservationManagerImpl<
|
|
|
|
CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
|
|
|
|
|
|
|
|
} else {
|
|
|
|
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create another 128 file metadata.
|
|
|
|
// This increases the file metadata cache reservation to 2 *
|
|
|
|
// CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize().
|
|
|
|
for (int i = 1; i <= 128; ++i) {
|
|
|
|
ASSERT_OK(Put(std::string(1024, 'a'), "vva"));
|
|
|
|
ASSERT_OK(Put("b", "vvb"));
|
|
|
|
ASSERT_OK(Flush());
|
|
|
|
}
|
|
|
|
if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
|
|
|
|
2 * CacheReservationManagerImpl<
|
|
|
|
CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
|
|
|
|
} else {
|
|
|
|
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
|
|
|
|
}
|
|
|
|
// Compaction will create 1 new file metadata, obsolete and delete all 256
|
|
|
|
// file metadata above. This results in 1 *
|
|
|
|
// CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
|
|
|
|
// cache reservation for file metadata.
|
2022-08-05 19:58:07 +00:00
|
|
|
SyncPoint::GetInstance()->LoadDependency(
|
|
|
|
{{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
|
|
|
|
"ChargeFileMetadataTestWithParam::"
|
|
|
|
"PreVerifyingCacheReservationRelease"}});
|
|
|
|
SyncPoint::GetInstance()->EnableProcessing();
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
|
|
|
|
ASSERT_EQ("0,1", FilesPerLevel(0));
|
2022-08-05 19:58:07 +00:00
|
|
|
TEST_SYNC_POINT(
|
|
|
|
"ChargeFileMetadataTestWithParam::PreVerifyingCacheReservationRelease");
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
|
|
|
|
1 * CacheReservationManagerImpl<
|
|
|
|
CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
|
|
|
|
} else {
|
|
|
|
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
|
|
|
|
}
|
2022-08-05 19:58:07 +00:00
|
|
|
SyncPoint::GetInstance()->DisableProcessing();
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
|
|
|
|
// Destroying the db will delete the remaining 1 new file metadata
|
|
|
|
// This results in no cache reservation for file metadata.
|
|
|
|
Destroy(options);
|
|
|
|
EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
|
|
|
|
0 * CacheReservationManagerImpl<
|
|
|
|
CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
|
|
|
|
|
|
|
|
// Reopen the db with a smaller cache in order to test failure in allocating
|
|
|
|
// file metadata due to memory limit based on cache capacity
|
|
|
|
file_metadata_charge_only_cache = std::make_shared<
|
|
|
|
TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
|
|
|
|
NewLRUCache(1 * CacheReservationManagerImpl<
|
|
|
|
CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
|
|
|
|
0 /* num_shard_bits */, true /* strict_capacity_limit */));
|
|
|
|
table_options.block_cache = file_metadata_charge_only_cache;
|
|
|
|
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
|
|
|
|
Reopen(options);
|
|
|
|
ASSERT_OK(Put(std::string(1024, 'a'), "va"));
|
|
|
|
ASSERT_OK(Put("b", "vb"));
|
|
|
|
Status s = Flush();
|
|
|
|
if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
EXPECT_TRUE(s.IsMemoryLimit());
|
|
|
|
EXPECT_TRUE(s.ToString().find(
|
|
|
|
kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
|
|
|
|
CacheEntryRole::kFileMetadata)]) != std::string::npos);
|
|
|
|
EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
|
|
|
|
std::string::npos);
|
|
|
|
} else {
|
|
|
|
EXPECT_TRUE(s.ok());
|
|
|
|
}
|
|
|
|
}
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2011-06-22 18:45:39 +00:00
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2022-10-18 07:35:35 +00:00
|
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
2015-03-17 21:08:00 +00:00
|
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
|
|
return RUN_ALL_TESTS();
|
2011-06-22 18:45:39 +00:00
|
|
|
}
|