2019-12-09 07:49:32 +00:00
|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db_stress_tool/db_stress_listener.h"
|
|
|
|
#include "db_stress_tool/db_stress_shared_state.h"
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
#include "db_stress_tool/expected_state.h"
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
#include "rocksdb/status.h"
|
2019-12-09 07:49:32 +00:00
|
|
|
#ifdef GFLAGS
|
2023-09-11 23:32:32 +00:00
|
|
|
#include "db/wide/wide_columns_helper.h"
|
2019-12-09 07:49:32 +00:00
|
|
|
#include "db_stress_tool/db_stress_common.h"
|
2022-05-02 20:25:45 +00:00
|
|
|
#include "rocksdb/utilities/transaction_db.h"
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
2022-06-14 20:06:40 +00:00
|
|
|
#include "utilities/fault_injection_fs.h"
|
2019-12-09 07:49:32 +00:00
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
namespace ROCKSDB_NAMESPACE {
|
2019-12-09 07:49:32 +00:00
|
|
|
class NonBatchedOpsStressTest : public StressTest {
|
|
|
|
public:
|
2023-12-04 19:17:32 +00:00
|
|
|
NonBatchedOpsStressTest() = default;
|
2019-12-09 07:49:32 +00:00
|
|
|
|
2023-12-04 19:17:32 +00:00
|
|
|
virtual ~NonBatchedOpsStressTest() = default;
|
2019-12-09 07:49:32 +00:00
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
void VerifyDb(ThreadState* thread) const override {
|
2022-02-17 07:17:03 +00:00
|
|
|
// This `ReadOptions` is for validation purposes. Ignore
|
|
|
|
// `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
|
2019-12-09 07:49:32 +00:00
|
|
|
ReadOptions options(FLAGS_verify_checksum, true);
|
Add user-defined timestamps to db_stress (#8061)
Summary:
Add some basic test for user-defined timestamp to db_stress. Currently,
read with timestamp always tries to read using the current timestamp.
Due to the per-key timestamp-sequence ordering constraint, we only add timestamp-
related tests to the `NonBatchedOpsStressTest` since this test serializes accesses
to the same key and uses a file to cross-check data correctness.
The timestamp feature is not supported in a number of components, e.g. Merge, SingleDelete,
DeleteRange, CompactionFilter, Readonly instance, secondary instance, SST file ingestion, transaction,
etc. Therefore, db_stress should exit if user enables both timestamp and these features at the same
time. The (currently) incompatible features can be found in
`CheckAndSetOptionsForUserTimestamp`.
This PR also fixes a bug triggered when timestamp is enabled together with
`index_type=kBinarySearchWithFirstKey`. This bug fix will also be in another separate PR
with more unit tests coverage. Fixing it here because I do not want to exclude the index type
from crash test.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8061
Test Plan: make crash_test_with_ts
Reviewed By: jay-zhuang
Differential Revision: D27056282
Pulled By: riversand963
fbshipit-source-id: c3e00ad1023fdb9ebbdf9601ec18270c5e2925a9
2021-03-23 12:12:04 +00:00
|
|
|
std::string ts_str;
|
|
|
|
Slice ts;
|
|
|
|
if (FLAGS_user_timestamp_size > 0) {
|
2022-07-05 20:30:15 +00:00
|
|
|
ts_str = GetNowNanos();
|
Add user-defined timestamps to db_stress (#8061)
Summary:
Add some basic test for user-defined timestamp to db_stress. Currently,
read with timestamp always tries to read using the current timestamp.
Due to the per-key timestamp-sequence ordering constraint, we only add timestamp-
related tests to the `NonBatchedOpsStressTest` since this test serializes accesses
to the same key and uses a file to cross-check data correctness.
The timestamp feature is not supported in a number of components, e.g. Merge, SingleDelete,
DeleteRange, CompactionFilter, Readonly instance, secondary instance, SST file ingestion, transaction,
etc. Therefore, db_stress should exit if user enables both timestamp and these features at the same
time. The (currently) incompatible features can be found in
`CheckAndSetOptionsForUserTimestamp`.
This PR also fixes a bug triggered when timestamp is enabled together with
`index_type=kBinarySearchWithFirstKey`. This bug fix will also be in another separate PR
with more unit tests coverage. Fixing it here because I do not want to exclude the index type
from crash test.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8061
Test Plan: make crash_test_with_ts
Reviewed By: jay-zhuang
Differential Revision: D27056282
Pulled By: riversand963
fbshipit-source-id: c3e00ad1023fdb9ebbdf9601ec18270c5e2925a9
2021-03-23 12:12:04 +00:00
|
|
|
ts = ts_str;
|
|
|
|
options.timestamp = &ts;
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
auto shared = thread->shared;
|
|
|
|
const int64_t max_key = shared->GetMaxKey();
|
|
|
|
const int64_t keys_per_thread = max_key / shared->GetNumThreads();
|
|
|
|
int64_t start = keys_per_thread * thread->tid;
|
|
|
|
int64_t end = start + keys_per_thread;
|
|
|
|
uint64_t prefix_to_use =
|
|
|
|
(FLAGS_prefix_size < 0) ? 1 : static_cast<size_t>(FLAGS_prefix_size);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
if (thread->tid == shared->GetNumThreads() - 1) {
|
|
|
|
end = max_key;
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
for (size_t cf = 0; cf < column_families_.size(); ++cf) {
|
|
|
|
if (thread->shared->HasVerificationFailedYet()) {
|
|
|
|
break;
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
enum class VerificationMethod {
|
|
|
|
kIterator,
|
|
|
|
kGet,
|
2023-01-31 18:17:48 +00:00
|
|
|
kGetEntity,
|
2022-09-26 22:33:36 +00:00
|
|
|
kMultiGet,
|
2023-02-16 01:08:25 +00:00
|
|
|
kMultiGetEntity,
|
2022-09-26 22:33:36 +00:00
|
|
|
kGetMergeOperands,
|
|
|
|
// Add any new items above kNumberOfMethods
|
|
|
|
kNumberOfMethods
|
|
|
|
};
|
|
|
|
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
constexpr int num_methods =
|
2022-09-26 22:33:36 +00:00
|
|
|
static_cast<int>(VerificationMethod::kNumberOfMethods);
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
|
2022-09-26 22:33:36 +00:00
|
|
|
const VerificationMethod method =
|
2022-11-18 04:43:50 +00:00
|
|
|
static_cast<VerificationMethod>(thread->rand.Uniform(
|
|
|
|
(FLAGS_user_timestamp_size > 0) ? num_methods - 1 : num_methods));
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
if (method == VerificationMethod::kIterator) {
|
2019-12-09 07:49:32 +00:00
|
|
|
std::unique_ptr<Iterator> iter(
|
|
|
|
db_->NewIterator(options, column_families_[cf]));
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
std::string seek_key = Key(start);
|
2020-01-10 05:25:40 +00:00
|
|
|
iter->Seek(seek_key);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
Slice prefix(seek_key.data(), prefix_to_use);
|
|
|
|
|
|
|
|
for (int64_t i = start; i < end; ++i) {
|
2019-12-09 07:49:32 +00:00
|
|
|
if (thread->shared->HasVerificationFailedYet()) {
|
|
|
|
break;
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
const std::string key = Key(i);
|
|
|
|
const Slice k(key);
|
|
|
|
const Slice pfx(key.data(), prefix_to_use);
|
|
|
|
|
2020-01-10 05:25:40 +00:00
|
|
|
// Reseek when the prefix changes
|
|
|
|
if (prefix_to_use > 0 && prefix.compare(pfx) != 0) {
|
|
|
|
iter->Seek(k);
|
2022-09-26 22:33:36 +00:00
|
|
|
seek_key = key;
|
2020-01-10 05:25:40 +00:00
|
|
|
prefix = Slice(seek_key.data(), prefix_to_use);
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
Status s = iter->status();
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
|
2022-09-26 22:33:36 +00:00
|
|
|
std::string from_db;
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
if (iter->Valid()) {
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
const int diff = iter->key().compare(k);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
if (diff > 0) {
|
|
|
|
s = Status::NotFound();
|
|
|
|
} else if (diff == 0) {
|
2023-03-17 21:47:29 +00:00
|
|
|
if (!VerifyWideColumns(iter->value(), iter->columns())) {
|
2022-10-06 22:07:16 +00:00
|
|
|
VerificationAbort(shared, static_cast<int>(cf), i,
|
2023-03-17 21:47:29 +00:00
|
|
|
iter->value(), iter->columns());
|
2022-10-06 22:07:16 +00:00
|
|
|
}
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
from_db = iter->value().ToString();
|
|
|
|
iter->Next();
|
2022-09-26 22:33:36 +00:00
|
|
|
} else {
|
|
|
|
assert(diff < 0);
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
VerificationAbort(shared, "An out of range key was found",
|
|
|
|
static_cast<int>(cf), i);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// The iterator found no value for the key in question, so do not
|
|
|
|
// move to the next item in the iterator
|
2019-12-20 16:46:52 +00:00
|
|
|
s = Status::NotFound();
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2022-04-27 13:01:09 +00:00
|
|
|
VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
/* msg_prefix */ "Iterator verification", s);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
if (!from_db.empty()) {
|
2019-12-09 07:49:32 +00:00
|
|
|
PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
|
2022-09-26 22:33:36 +00:00
|
|
|
from_db.data(), from_db.size());
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
} else if (method == VerificationMethod::kGet) {
|
|
|
|
for (int64_t i = start; i < end; ++i) {
|
2019-12-09 07:49:32 +00:00
|
|
|
if (thread->shared->HasVerificationFailedYet()) {
|
|
|
|
break;
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
const std::string key = Key(i);
|
2019-12-09 07:49:32 +00:00
|
|
|
std::string from_db;
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
Status s = db_->Get(options, column_families_[cf], key, &from_db);
|
|
|
|
|
2022-04-27 13:01:09 +00:00
|
|
|
VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
/* msg_prefix */ "Get verification", s);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2023-01-31 18:17:48 +00:00
|
|
|
if (!from_db.empty()) {
|
|
|
|
PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
|
|
|
|
from_db.data(), from_db.size());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (method == VerificationMethod::kGetEntity) {
|
|
|
|
for (int64_t i = start; i < end; ++i) {
|
|
|
|
if (thread->shared->HasVerificationFailedYet()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
const std::string key = Key(i);
|
2023-03-17 21:47:29 +00:00
|
|
|
PinnableWideColumns result;
|
2023-01-31 18:17:48 +00:00
|
|
|
|
|
|
|
Status s =
|
2023-03-17 21:47:29 +00:00
|
|
|
db_->GetEntity(options, column_families_[cf], key, &result);
|
2023-01-31 18:17:48 +00:00
|
|
|
|
|
|
|
std::string from_db;
|
|
|
|
|
|
|
|
if (s.ok()) {
|
2023-03-17 21:47:29 +00:00
|
|
|
const WideColumns& columns = result.columns();
|
2023-02-16 01:08:25 +00:00
|
|
|
|
2023-09-11 23:32:32 +00:00
|
|
|
if (WideColumnsHelper::HasDefaultColumn(columns)) {
|
|
|
|
from_db = WideColumnsHelper::GetDefaultColumn(columns).ToString();
|
2023-01-31 18:17:48 +00:00
|
|
|
}
|
|
|
|
|
2023-03-17 21:47:29 +00:00
|
|
|
if (!VerifyWideColumns(columns)) {
|
2023-01-31 18:17:48 +00:00
|
|
|
VerificationAbort(shared, static_cast<int>(cf), i, from_db,
|
2023-03-17 21:47:29 +00:00
|
|
|
columns);
|
2023-01-31 18:17:48 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
/* msg_prefix */ "GetEntity verification", s);
|
2023-01-31 18:17:48 +00:00
|
|
|
|
2022-09-26 22:33:36 +00:00
|
|
|
if (!from_db.empty()) {
|
2019-12-09 07:49:32 +00:00
|
|
|
PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
|
2022-09-26 22:33:36 +00:00
|
|
|
from_db.data(), from_db.size());
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
} else if (method == VerificationMethod::kMultiGet) {
|
|
|
|
for (int64_t i = start; i < end;) {
|
2020-05-19 19:41:43 +00:00
|
|
|
if (thread->shared->HasVerificationFailedYet()) {
|
|
|
|
break;
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2020-05-19 19:41:43 +00:00
|
|
|
// Keep the batch size to some reasonable value
|
|
|
|
size_t batch_size = thread->rand.Uniform(128) + 1;
|
|
|
|
batch_size = std::min<size_t>(batch_size, end - i);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2023-02-16 01:08:25 +00:00
|
|
|
std::vector<std::string> key_strs(batch_size);
|
2020-05-19 19:41:43 +00:00
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<PinnableSlice> values(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2020-05-19 19:41:43 +00:00
|
|
|
for (size_t j = 0; j < batch_size; ++j) {
|
2023-02-16 01:08:25 +00:00
|
|
|
key_strs[j] = Key(i + j);
|
|
|
|
keys[j] = Slice(key_strs[j]);
|
2020-05-19 19:41:43 +00:00
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2020-05-19 19:41:43 +00:00
|
|
|
db_->MultiGet(options, column_families_[cf], batch_size, keys.data(),
|
|
|
|
values.data(), statuses.data());
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2020-05-19 19:41:43 +00:00
|
|
|
for (size_t j = 0; j < batch_size; ++j) {
|
2022-09-26 22:33:36 +00:00
|
|
|
const std::string from_db = values[j].ToString();
|
|
|
|
|
2022-04-27 13:01:09 +00:00
|
|
|
VerifyOrSyncValue(static_cast<int>(cf), i + j, options, shared,
|
2023-01-27 15:45:25 +00:00
|
|
|
from_db, /* msg_prefix */ "MultiGet verification",
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
statuses[j]);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
if (!from_db.empty()) {
|
2020-05-19 19:41:43 +00:00
|
|
|
PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i + j),
|
2022-09-26 22:33:36 +00:00
|
|
|
from_db.data(), from_db.size());
|
2020-05-19 19:41:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-16 01:08:25 +00:00
|
|
|
i += batch_size;
|
|
|
|
}
|
|
|
|
} else if (method == VerificationMethod::kMultiGetEntity) {
|
|
|
|
for (int64_t i = start; i < end;) {
|
|
|
|
if (thread->shared->HasVerificationFailedYet()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Keep the batch size to some reasonable value
|
|
|
|
size_t batch_size = thread->rand.Uniform(128) + 1;
|
|
|
|
batch_size = std::min<size_t>(batch_size, end - i);
|
|
|
|
|
|
|
|
std::vector<std::string> key_strs(batch_size);
|
|
|
|
std::vector<Slice> keys(batch_size);
|
|
|
|
std::vector<PinnableWideColumns> results(batch_size);
|
|
|
|
std::vector<Status> statuses(batch_size);
|
|
|
|
|
|
|
|
for (size_t j = 0; j < batch_size; ++j) {
|
|
|
|
key_strs[j] = Key(i + j);
|
|
|
|
keys[j] = Slice(key_strs[j]);
|
|
|
|
}
|
|
|
|
|
|
|
|
db_->MultiGetEntity(options, column_families_[cf], batch_size,
|
|
|
|
keys.data(), results.data(), statuses.data());
|
|
|
|
|
|
|
|
for (size_t j = 0; j < batch_size; ++j) {
|
|
|
|
std::string from_db;
|
|
|
|
|
|
|
|
if (statuses[j].ok()) {
|
2023-03-17 21:47:29 +00:00
|
|
|
const WideColumns& columns = results[j].columns();
|
2023-02-16 01:08:25 +00:00
|
|
|
|
2023-09-11 23:32:32 +00:00
|
|
|
if (WideColumnsHelper::HasDefaultColumn(columns)) {
|
|
|
|
from_db =
|
|
|
|
WideColumnsHelper::GetDefaultColumn(columns).ToString();
|
2023-02-16 01:08:25 +00:00
|
|
|
}
|
|
|
|
|
2023-03-17 21:47:29 +00:00
|
|
|
if (!VerifyWideColumns(columns)) {
|
2023-02-16 01:08:25 +00:00
|
|
|
VerificationAbort(shared, static_cast<int>(cf), i, from_db,
|
2023-03-17 21:47:29 +00:00
|
|
|
columns);
|
2023-02-16 01:08:25 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
VerifyOrSyncValue(
|
|
|
|
static_cast<int>(cf), i + j, options, shared, from_db,
|
|
|
|
/* msg_prefix */ "MultiGetEntity verification", statuses[j]);
|
2023-02-16 01:08:25 +00:00
|
|
|
|
|
|
|
if (!from_db.empty()) {
|
|
|
|
PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i + j),
|
|
|
|
from_db.data(), from_db.size());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-05-19 19:41:43 +00:00
|
|
|
i += batch_size;
|
|
|
|
}
|
2022-04-05 21:56:28 +00:00
|
|
|
} else {
|
2022-09-26 22:33:36 +00:00
|
|
|
assert(method == VerificationMethod::kGetMergeOperands);
|
|
|
|
|
2022-04-05 21:56:28 +00:00
|
|
|
// Start off with small size that will be increased later if necessary
|
|
|
|
std::vector<PinnableSlice> values(4);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2022-04-05 21:56:28 +00:00
|
|
|
GetMergeOperandsOptions merge_operands_info;
|
|
|
|
merge_operands_info.expected_max_number_of_operands =
|
|
|
|
static_cast<int>(values.size());
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
for (int64_t i = start; i < end; ++i) {
|
2022-04-05 21:56:28 +00:00
|
|
|
if (thread->shared->HasVerificationFailedYet()) {
|
|
|
|
break;
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
const std::string key = Key(i);
|
|
|
|
const Slice k(key);
|
2022-04-05 21:56:28 +00:00
|
|
|
std::string from_db;
|
|
|
|
int number_of_operands = 0;
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2022-04-05 21:56:28 +00:00
|
|
|
Status s = db_->GetMergeOperands(options, column_families_[cf], k,
|
|
|
|
values.data(), &merge_operands_info,
|
|
|
|
&number_of_operands);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2022-04-05 21:56:28 +00:00
|
|
|
if (s.IsIncomplete()) {
|
|
|
|
// Need to resize values as there are more than values.size() merge
|
|
|
|
// operands on this key. Should only happen a few times when we
|
|
|
|
// encounter a key that had more merge operands than any key seen so
|
|
|
|
// far
|
|
|
|
values.resize(number_of_operands);
|
|
|
|
merge_operands_info.expected_max_number_of_operands =
|
|
|
|
static_cast<int>(number_of_operands);
|
|
|
|
s = db_->GetMergeOperands(options, column_families_[cf], k,
|
|
|
|
values.data(), &merge_operands_info,
|
|
|
|
&number_of_operands);
|
|
|
|
}
|
|
|
|
// Assumed here that GetMergeOperands always sets number_of_operand
|
|
|
|
if (number_of_operands) {
|
|
|
|
from_db = values[number_of_operands - 1].ToString();
|
|
|
|
}
|
2022-09-26 22:33:36 +00:00
|
|
|
|
2022-04-27 13:01:09 +00:00
|
|
|
VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
/* msg_prefix */ "GetMergeOperands verification",
|
|
|
|
s);
|
2022-09-26 22:33:36 +00:00
|
|
|
|
|
|
|
if (!from_db.empty()) {
|
2022-04-05 21:56:28 +00:00
|
|
|
PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
|
2022-09-26 22:33:36 +00:00
|
|
|
from_db.data(), from_db.size());
|
2022-04-05 21:56:28 +00:00
|
|
|
}
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-08 04:07:47 +00:00
|
|
|
void ContinuouslyVerifyDb(ThreadState* thread) const override {
|
|
|
|
if (!cmp_db_) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
assert(cmp_db_);
|
|
|
|
assert(!cmp_cfhs_.empty());
|
|
|
|
Status s = cmp_db_->TryCatchUpWithPrimary();
|
|
|
|
if (!s.ok()) {
|
|
|
|
assert(false);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
const auto checksum_column_family = [](Iterator* iter,
|
|
|
|
uint32_t* checksum) -> Status {
|
|
|
|
assert(nullptr != checksum);
|
|
|
|
uint32_t ret = 0;
|
|
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
|
|
ret = crc32c::Extend(ret, iter->key().data(), iter->key().size());
|
|
|
|
ret = crc32c::Extend(ret, iter->value().data(), iter->value().size());
|
|
|
|
}
|
|
|
|
*checksum = ret;
|
|
|
|
return iter->status();
|
|
|
|
};
|
|
|
|
|
|
|
|
auto* shared = thread->shared;
|
|
|
|
assert(shared);
|
|
|
|
const int64_t max_key = shared->GetMaxKey();
|
|
|
|
ReadOptions read_opts(FLAGS_verify_checksum, true);
|
|
|
|
std::string ts_str;
|
|
|
|
Slice ts;
|
|
|
|
if (FLAGS_user_timestamp_size > 0) {
|
2022-07-05 20:30:15 +00:00
|
|
|
ts_str = GetNowNanos();
|
2022-06-08 04:07:47 +00:00
|
|
|
ts = ts_str;
|
|
|
|
read_opts.timestamp = &ts;
|
|
|
|
}
|
|
|
|
|
|
|
|
static Random64 rand64(shared->GetSeed());
|
|
|
|
|
|
|
|
{
|
|
|
|
uint32_t crc = 0;
|
|
|
|
std::unique_ptr<Iterator> it(cmp_db_->NewIterator(read_opts));
|
|
|
|
s = checksum_column_family(it.get(), &crc);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "Computing checksum of default cf: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto* handle : cmp_cfhs_) {
|
|
|
|
if (thread->rand.OneInOpt(3)) {
|
|
|
|
// Use Get()
|
|
|
|
uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
|
|
|
|
std::string key_str = Key(key);
|
|
|
|
std::string value;
|
|
|
|
std::string key_ts;
|
|
|
|
s = cmp_db_->Get(read_opts, handle, key_str, &value,
|
|
|
|
FLAGS_user_timestamp_size > 0 ? &key_ts : nullptr);
|
|
|
|
s.PermitUncheckedError();
|
|
|
|
} else {
|
|
|
|
// Use range scan
|
|
|
|
std::unique_ptr<Iterator> iter(cmp_db_->NewIterator(read_opts, handle));
|
|
|
|
uint32_t rnd = (thread->rand.Next()) % 4;
|
|
|
|
if (0 == rnd) {
|
|
|
|
// SeekToFirst() + Next()*5
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
iter->SeekToFirst();
|
|
|
|
for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Next()) {
|
|
|
|
}
|
|
|
|
} else if (1 == rnd) {
|
|
|
|
// SeekToLast() + Prev()*5
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
iter->SeekToLast();
|
|
|
|
for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Prev()) {
|
|
|
|
}
|
|
|
|
} else if (2 == rnd) {
|
|
|
|
// Seek() +Next()*5
|
|
|
|
uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
|
|
|
|
std::string key_str = Key(key);
|
|
|
|
iter->Seek(key_str);
|
|
|
|
for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Next()) {
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// SeekForPrev() + Prev()*5
|
|
|
|
uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
|
|
|
|
std::string key_str = Key(key);
|
|
|
|
iter->SeekForPrev(key_str);
|
|
|
|
for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Prev()) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
void MaybeClearOneColumnFamily(ThreadState* thread) override {
|
2019-12-13 21:25:14 +00:00
|
|
|
if (FLAGS_column_families > 1) {
|
|
|
|
if (thread->rand.OneInOpt(FLAGS_clear_column_family_one_in)) {
|
2019-12-09 07:49:32 +00:00
|
|
|
// drop column family and then create it again (can't drop default)
|
|
|
|
int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
|
2022-05-06 20:03:58 +00:00
|
|
|
std::string new_name =
|
|
|
|
std::to_string(new_column_family_name_.fetch_add(1));
|
2019-12-09 07:49:32 +00:00
|
|
|
{
|
|
|
|
MutexLock l(thread->shared->GetMutex());
|
|
|
|
fprintf(
|
|
|
|
stdout,
|
|
|
|
"[CF %d] Dropping and recreating column family. new name: %s\n",
|
|
|
|
cf, new_name.c_str());
|
|
|
|
}
|
|
|
|
thread->shared->LockColumnFamily(cf);
|
|
|
|
Status s = db_->DropColumnFamily(column_families_[cf]);
|
|
|
|
delete column_families_[cf];
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "dropping column family error: %s\n",
|
|
|
|
s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
|
|
|
|
&column_families_[cf]);
|
|
|
|
column_family_names_[cf] = new_name;
|
|
|
|
thread->shared->ClearColumnFamily(cf);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "creating column family error: %s\n",
|
|
|
|
s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
thread->shared->UnlockColumnFamily(cf);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
bool ShouldAcquireMutexOnKey() const override { return true; }
|
2019-12-09 07:49:32 +00:00
|
|
|
|
2021-12-07 21:40:46 +00:00
|
|
|
bool IsStateTracked() const override { return true; }
|
|
|
|
|
2024-05-09 22:37:38 +00:00
|
|
|
void TestKeyMayExist(ThreadState* thread, const ReadOptions& read_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
|
|
|
auto cfh = column_families_[rand_column_families[0]];
|
|
|
|
std::string key_str = Key(rand_keys[0]);
|
|
|
|
Slice key = key_str;
|
|
|
|
std::string ignore;
|
|
|
|
ReadOptions read_opts_copy = read_opts;
|
|
|
|
|
|
|
|
std::string read_ts_str;
|
|
|
|
Slice read_ts_slice;
|
|
|
|
if (FLAGS_user_timestamp_size > 0) {
|
|
|
|
read_ts_str = GetNowNanos();
|
|
|
|
read_ts_slice = read_ts_str;
|
|
|
|
read_opts_copy.timestamp = &read_ts_slice;
|
|
|
|
}
|
|
|
|
bool read_older_ts = MaybeUseOlderTimestampForPointLookup(
|
|
|
|
thread, read_ts_str, read_ts_slice, read_opts_copy);
|
|
|
|
|
|
|
|
const ExpectedValue pre_read_expected_value =
|
|
|
|
thread->shared->Get(rand_column_families[0], rand_keys[0]);
|
|
|
|
bool key_may_exist = db_->KeyMayExist(read_opts_copy, cfh, key, &ignore);
|
|
|
|
const ExpectedValue post_read_expected_value =
|
|
|
|
thread->shared->Get(rand_column_families[0], rand_keys[0]);
|
|
|
|
|
|
|
|
if (!key_may_exist && !FLAGS_skip_verifydb && !read_older_ts) {
|
|
|
|
if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value,
|
|
|
|
post_read_expected_value)) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
|
|
|
"error : inconsistent values for key %s: expected state has "
|
|
|
|
"the key, TestKeyMayExist() returns false indicating the key "
|
|
|
|
"must not exist.\n",
|
|
|
|
key.ToString(true).c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
Status TestGet(ThreadState* thread, const ReadOptions& read_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
2019-12-09 07:49:32 +00:00
|
|
|
auto cfh = column_families_[rand_column_families[0]];
|
|
|
|
std::string key_str = Key(rand_keys[0]);
|
|
|
|
Slice key = key_str;
|
|
|
|
std::string from_db;
|
2022-07-05 20:30:15 +00:00
|
|
|
|
|
|
|
ReadOptions read_opts_copy = read_opts;
|
|
|
|
std::string read_ts_str;
|
|
|
|
Slice read_ts_slice;
|
2023-02-24 01:00:04 +00:00
|
|
|
if (FLAGS_user_timestamp_size > 0) {
|
|
|
|
read_ts_str = GetNowNanos();
|
|
|
|
read_ts_slice = read_ts_str;
|
|
|
|
read_opts_copy.timestamp = &read_ts_slice;
|
|
|
|
}
|
2022-09-30 23:13:03 +00:00
|
|
|
bool read_older_ts = MaybeUseOlderTimestampForPointLookup(
|
|
|
|
thread, read_ts_str, read_ts_slice, read_opts_copy);
|
2022-07-05 20:30:15 +00:00
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
2024-06-22 03:11:57 +00:00
|
|
|
SharedState::ignore_read_error = false;
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
const ExpectedValue pre_read_expected_value =
|
|
|
|
thread->shared->Get(rand_column_families[0], rand_keys[0]);
|
2022-07-05 20:30:15 +00:00
|
|
|
Status s = db_->Get(read_opts_copy, cfh, key, &from_db);
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
const ExpectedValue post_read_expected_value =
|
|
|
|
thread->shared->Get(rand_column_families[0], rand_keys[0]);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
|
|
|
|
int injected_error_count = 0;
|
2024-06-24 04:54:27 +00:00
|
|
|
if (fault_fs_guard) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
injected_error_count = GetMinInjectedErrorCount(
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead),
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead));
|
2024-06-24 04:54:27 +00:00
|
|
|
if (!SharedState::ignore_read_error && injected_error_count > 0 &&
|
|
|
|
(s.ok() || s.IsNotFound())) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Grab mutex so multiple thread don't try to print the
|
|
|
|
// stack trace at the same time
|
|
|
|
MutexLock l(thread->shared->GetMutex());
|
|
|
|
fprintf(stderr, "Didn't get expected error from Get\n");
|
|
|
|
fprintf(stderr, "Callstack that injected the fault\n");
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
std::terminate();
|
|
|
|
}
|
2020-04-11 00:18:56 +00:00
|
|
|
}
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
// found case
|
|
|
|
thread->stats.AddGets(1, 1);
|
2022-08-24 00:08:14 +00:00
|
|
|
// we only have the latest expected state
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
if (!FLAGS_skip_verifydb && !read_older_ts) {
|
|
|
|
if (ExpectedValueHelper::MustHaveNotExisted(pre_read_expected_value,
|
|
|
|
post_read_expected_value)) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
"error : inconsistent values for key %s (%" PRIi64
|
|
|
|
"): Get returns %s, "
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
"but expected state is \"deleted\".\n",
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
key.ToString(true).c_str(), rand_keys[0],
|
|
|
|
StringToHex(from_db).c_str());
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
}
|
|
|
|
Slice from_db_slice(from_db);
|
|
|
|
uint32_t value_base_from_db = GetValueBase(from_db_slice);
|
|
|
|
if (!ExpectedValueHelper::InExpectedValueBaseRange(
|
|
|
|
value_base_from_db, pre_read_expected_value,
|
|
|
|
post_read_expected_value)) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
"error : inconsistent values for key %s (%" PRIi64
|
|
|
|
"): Get returns %s with "
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
"value base %d that falls out of expected state's value base "
|
|
|
|
"range.\n",
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
key.ToString(true).c_str(), rand_keys[0],
|
|
|
|
StringToHex(from_db).c_str(), value_base_from_db);
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
}
|
2022-08-24 00:08:14 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
// not found case
|
|
|
|
thread->stats.AddGets(1, 0);
|
2022-09-30 23:13:03 +00:00
|
|
|
if (!FLAGS_skip_verifydb && !read_older_ts) {
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value,
|
|
|
|
post_read_expected_value)) {
|
2022-08-24 00:08:14 +00:00
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
"error : inconsistent values for key %s (%" PRIi64
|
|
|
|
"): expected state has "
|
2022-08-24 00:08:14 +00:00
|
|
|
"the key, Get() returns NotFound.\n",
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
key.ToString(true).c_str(), rand_keys[0]);
|
2022-08-24 00:08:14 +00:00
|
|
|
}
|
|
|
|
}
|
2024-06-25 03:51:39 +00:00
|
|
|
} else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
thread->shared->SetVerificationFailure();
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
fprintf(stderr, "error : Get() returns %s for key: %s (%" PRIi64 ").\n",
|
|
|
|
s.ToString().c_str(), key.ToString(true).c_str(), rand_keys[0]);
|
2020-04-11 00:18:56 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
std::vector<Status> TestMultiGet(
|
2019-12-09 07:49:32 +00:00
|
|
|
ThreadState* thread, const ReadOptions& read_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
2019-12-09 22:36:10 +00:00
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
2019-12-09 07:49:32 +00:00
|
|
|
size_t num_keys = rand_keys.size();
|
|
|
|
std::vector<std::string> key_str;
|
|
|
|
std::vector<Slice> keys;
|
|
|
|
key_str.reserve(num_keys);
|
|
|
|
keys.reserve(num_keys);
|
|
|
|
std::vector<PinnableSlice> values(num_keys);
|
|
|
|
std::vector<Status> statuses(num_keys);
|
2023-05-22 19:31:52 +00:00
|
|
|
// When Flags_use_txn is enabled, we also do a read your write check.
|
2024-05-23 23:47:39 +00:00
|
|
|
std::unordered_map<std::string, ExpectedValue> ryw_expected_values;
|
2023-05-22 19:31:52 +00:00
|
|
|
|
|
|
|
SharedState* shared = thread->shared;
|
2024-05-24 21:24:17 +00:00
|
|
|
assert(shared);
|
2023-05-22 19:31:52 +00:00
|
|
|
|
|
|
|
int column_family = rand_column_families[0];
|
|
|
|
ColumnFamilyHandle* cfh = column_families_[column_family];
|
2024-04-15 23:11:58 +00:00
|
|
|
|
|
|
|
bool do_consistency_check = FLAGS_check_multiget_consistency;
|
2020-05-19 19:41:43 +00:00
|
|
|
|
|
|
|
ReadOptions readoptionscopy = read_opts;
|
Group rocksdb.sst.read.micros stat by different user read IOActivity + misc (#11444)
Summary:
**Context/Summary:**
- Similar to https://github.com/facebook/rocksdb/pull/11288 but for user read such as `Get(), MultiGet(), DBIterator::XXX(), Verify(File)Checksum()`.
- For this, I refactored some user-facing `MultiGet` calls in `TransactionBase` and various types of `DB` so that it does not call a user-facing `Get()` but `GetImpl()` for passing the `ReadOptions::io_activity` check (see PR conversation)
- New user read stats breakdown are guarded by `kExceptDetailedTimers` since measurement shows they have 4-5% regression to the upstream/main.
- Misc
- More refactoring: with https://github.com/facebook/rocksdb/pull/11288, we complete passing `ReadOptions/IOOptions` to FS level. So we can now replace the previously [added](https://github.com/facebook/rocksdb/pull/9424) `rate_limiter_priority` parameter in `RandomAccessFileReader`'s `Read/MultiRead/Prefetch()` with `IOOptions::rate_limiter_priority`
- Also, `ReadAsync()` call time is measured in `SST_READ_MICRO` now
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11444
Test Plan:
- CI fake db crash/stress test
- Microbenchmarking
**Build** `make clean && ROCKSDB_NO_FBCODE=1 DEBUG_LEVEL=0 make -jN db_basic_bench`
- google benchmark version: https://github.com/google/benchmark/commit/604f6fd3f4b34a84ec4eb4db81d842fa4db829cd
- db_basic_bench_base: upstream
- db_basic_bench_pr: db_basic_bench_base + this PR
- asyncread_db_basic_bench_base: upstream + [db basic bench patch for IteratorNext](https://github.com/facebook/rocksdb/compare/main...hx235:rocksdb:micro_bench_async_read)
- asyncread_db_basic_bench_pr: asyncread_db_basic_bench_base + this PR
**Test**
Get
```
TEST_TMPDIR=/dev/shm ./db_basic_bench_{null_stat|base|pr} --benchmark_filter=DBGet/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/negative_query:0/enable_filter:0/mmap:1/threads:1 --benchmark_repetitions=1000
```
Result
```
Coming soon
```
AsyncRead
```
TEST_TMPDIR=/dev/shm ./asyncread_db_basic_bench_{base|pr} --benchmark_filter=IteratorNext/comp_style:0/max_data:134217728/per_key_size:256/enable_statistics:1/async_io:1/include_detailed_timers:0 --benchmark_repetitions=1000 > syncread_db_basic_bench_{base|pr}.out
```
Result
```
Base:
1956,1956,1968,1977,1979,1986,1988,1988,1988,1990,1991,1991,1993,1993,1993,1993,1994,1996,1997,1997,1997,1998,1999,2001,2001,2002,2004,2007,2007,2008,
PR (2.3% regression, due to measuring `SST_READ_MICRO` that wasn't measured before):
1993,2014,2016,2022,2024,2027,2027,2028,2028,2030,2031,2031,2032,2032,2038,2039,2042,2044,2044,2047,2047,2047,2048,2049,2050,2052,2052,2052,2053,2053,
```
Reviewed By: ajkr
Differential Revision: D45918925
Pulled By: hx235
fbshipit-source-id: 58a54560d9ebeb3a59b6d807639692614dad058a
2023-08-09 00:26:50 +00:00
|
|
|
|
2020-05-19 19:41:43 +00:00
|
|
|
if (do_consistency_check) {
|
|
|
|
readoptionscopy.snapshot = db_->GetSnapshot();
|
|
|
|
}
|
2022-07-05 20:30:15 +00:00
|
|
|
|
|
|
|
std::string read_ts_str;
|
|
|
|
Slice read_ts_slice;
|
|
|
|
MaybeUseOlderTimestampForPointLookup(thread, read_ts_str, read_ts_slice,
|
|
|
|
readoptionscopy);
|
|
|
|
|
2022-06-17 23:40:47 +00:00
|
|
|
readoptionscopy.rate_limiter_priority =
|
|
|
|
FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;
|
2019-12-09 07:49:32 +00:00
|
|
|
|
2019-12-25 02:45:11 +00:00
|
|
|
// To appease clang analyzer
|
|
|
|
const bool use_txn = FLAGS_use_txn;
|
|
|
|
|
2019-12-21 07:10:58 +00:00
|
|
|
// Create a transaction in order to write some data. The purpose is to
|
|
|
|
// exercise WriteBatchWithIndex::MultiGetFromBatchAndDB. The transaction
|
|
|
|
// will be rolled back once MultiGet returns.
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
std::unique_ptr<Transaction> txn;
|
2019-12-25 02:45:11 +00:00
|
|
|
if (use_txn) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// TODO(hx235): test fault injection with MultiGet() with transactions
|
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
2019-12-21 07:10:58 +00:00
|
|
|
WriteOptions wo;
|
Rate-limit automatic WAL flush after each user write (#9607)
Summary:
**Context:**
WAL flush is currently not rate-limited by `Options::rate_limiter`. This PR is to provide rate-limiting to auto WAL flush, the one that automatically happen after each user write operation (i.e, `Options::manual_wal_flush == false`), by adding `WriteOptions::rate_limiter_options`.
Note that we are NOT rate-limiting WAL flush that do NOT automatically happen after each user write, such as `Options::manual_wal_flush == true + manual FlushWAL()` (rate-limiting multiple WAL flushes), for the benefits of:
- being consistent with [ReadOptions::rate_limiter_priority](https://github.com/facebook/rocksdb/blob/7.0.fb/include/rocksdb/options.h#L515)
- being able to turn off some WAL flush's rate-limiting but not all (e.g, turn off specific the WAL flush of a critical user write like a service's heartbeat)
`WriteOptions::rate_limiter_options` only accept `Env::IO_USER` and `Env::IO_TOTAL` currently due to an implementation constraint.
- The constraint is that we currently queue parallel writes (including WAL writes) based on FIFO policy which does not factor rate limiter priority into this layer's scheduling. If we allow lower priorities such as `Env::IO_HIGH/MID/LOW` and such writes specified with lower priorities occurs before ones specified with higher priorities (even just by a tiny bit in arrival time), the former would have blocked the latter, leading to a "priority inversion" issue and contradictory to what we promise for rate-limiting priority. Therefore we only allow `Env::IO_USER` and `Env::IO_TOTAL` right now before improving that scheduling.
A pre-requisite to this feature is to support operation-level rate limiting in `WritableFileWriter`, which is also included in this PR.
**Summary:**
- Renamed test suite `DBRateLimiterTest to DBRateLimiterOnReadTest` for adding a new test suite
- Accept `rate_limiter_priority` in `WritableFileWriter`'s private and public write functions
- Passed `WriteOptions::rate_limiter_options` to `WritableFileWriter` in the path of automatic WAL flush.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9607
Test Plan:
- Added new unit test to verify existing flush/compaction rate-limiting does not break, since `DBTest, RateLimitingTest` is disabled and current db-level rate-limiting tests focus on read only (e.g, `db_rate_limiter_test`, `DBTest2, RateLimitedCompactionReads`).
- Added new unit test `DBRateLimiterOnWriteWALTest, AutoWalFlush`
- `strace -ftt -e trace=write ./db_bench -benchmarks=fillseq -db=/dev/shm/testdb -rate_limit_auto_wal_flush=1 -rate_limiter_bytes_per_sec=15 -rate_limiter_refill_period_us=1000000 -write_buffer_size=100000000 -disable_auto_compactions=1 -num=100`
- verified that WAL flush(i.e, system-call _write_) were chunked into 15 bytes and each _write_ was roughly 1 second apart
- verified the chunking disappeared when `-rate_limit_auto_wal_flush=0`
- crash test: `python3 tools/db_crashtest.py blackbox --disable_wal=0 --rate_limit_auto_wal_flush=1 --rate_limiter_bytes_per_sec=10485760 --interval=10` killed as normal
**Benchmarked on flush/compaction to ensure no performance regression:**
- compaction with rate-limiting (see table 1, avg over 1280-run): pre-change: **915635 micros/op**; post-change:
**907350 micros/op (improved by 0.106%)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f compact_bmk_output.txt compact_bmk_output_2.txt dont_care_output.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench --benchmarks=fillrandom -db=$TEST_TMPDIR -disable_auto_compactions=1 -write_buffer_size=6710886 > dont_care_output.txt && ./db_bench --benchmarks=compact -use_existing_db=1 -db=$TEST_TMPDIR -level0_file_num_compaction_trigger=1 -rate_limiter_bytes_per_sec=100000000 | egrep 'compact'
done > compact_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' compact_bmk_output.txt >> compact_bmk_output_2.txt
done
```
- compaction w/o rate-limiting (see table 2, avg over 640-run): pre-change: **822197 micros/op**; post-change: **823148 micros/op (regressed by 0.12%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
- flush with rate-limiting (see table 3, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench ): pre-change: **745752 micros/op**; post-change: **745331 micros/op (regressed by 0.06 %)**
```
#!/bin/bash
TEST_TMPDIR=/dev/shm/testdb
START=1
NUM_DATA_ENTRY=8
N=10
rm -f flush_bmk_output.txt flush_bmk_output_2.txt
for i in $(eval echo "{$START..$NUM_DATA_ENTRY}")
do
NUM_RUN=$(($N*(2**($i-1))))
for j in $(eval echo "{$START..$NUM_RUN}")
do
./db_bench -db=$TEST_TMPDIR -write_buffer_size=1048576000 -num=1000000 -rate_limiter_bytes_per_sec=100000000 -benchmarks=fillseq,flush | egrep 'flush'
done > flush_bmk_output.txt && awk -v NUM_RUN=$NUM_RUN '{sum+=$3;sum_sqrt+=$3^2}END{print sum/NUM_RUN, sqrt(sum_sqrt/NUM_RUN-(sum/NUM_RUN)^2)}' flush_bmk_output.txt >> flush_bmk_output_2.txt
done
```
- flush w/o rate-limiting (see table 4, avg over 320-run, run on the [patch](https://github.com/hx235/rocksdb/commit/ee5c6023a9f6533fab9afdc681568daa21da4953) to augment current db_bench): pre-change: **487512 micros/op**, post-change: **485856 micors/ops (improved by 0.34%)**
```
Same as above script, except that -rate_limiter_bytes_per_sec=0
```
| table 1 - compact with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 896978 | 16046.9 | 901242 | 15670.9 | 0.475373978
20 | 893718 | 15813 | 886505 | 17544.7 | -0.8070778478
40 | 900426 | 23882.2 | 894958 | 15104.5 | -0.6072681153
80 | 906635 | 21761.5 | 903332 | 23948.3 | -0.3643141948
160 | 898632 | 21098.9 | 907583 | 21145 | 0.9960695813
3.20E+02 | 905252 | 22785.5 | 908106 | 25325.5 | 0.3152713278
6.40E+02 | 905213 | 23598.6 | 906741 | 21370.5 | 0.1688000504
**1.28E+03** | **908316** | **23533.1** | **907350** | **24626.8** | **-0.1063506533**
average over #-run | 901896.25 | 21064.9625 | 901977.125 | 20592.025 | 0.008967217682
| table 2 - compact w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 811211 | 26996.7 | 807586 | 28456.4 | -0.4468627768
20 | 815465 | 14803.7 | 814608 | 28719.7 | -0.105093413
40 | 809203 | 26187.1 | 797835 | 25492.1 | -1.404839082
80 | 822088 | 28765.3 | 822192 | 32840.4 | 0.01265071379
160 | 821719 | 36344.7 | 821664 | 29544.9 | -0.006693285661
3.20E+02 | 820921 | 27756.4 | 821403 | 28347.7 | 0.05871454135
**6.40E+02** | **822197** | **28960.6** | **823148** | **30055.1** | **0.1156657103**
average over #-run | 8.18E+05 | 2.71E+04 | 8.15E+05 | 2.91E+04 | -0.25
| table 3 - flush with rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 741721 | 11770.8 | 740345 | 5949.76 | -0.1855144994
20 | 735169 | 3561.83 | 743199 | 9755.77 | 1.09226586
40 | 743368 | 8891.03 | 742102 | 8683.22 | -0.1703059588
80 | 742129 | 8148.51 | 743417 | 9631.58| 0.1735547324
160 | 749045 | 9757.21 | 746256 | 9191.86 | -0.3723407806
**3.20E+02** | **745752** | **9819.65** | **745331** | **9840.62** | **-0.0564530836**
6.40E+02 | 749006 | 11080.5 | 748173 | 10578.7 | -0.1112140624
average over #-run | 743741.4286 | 9004.218571 | 744117.5714 | 9090.215714 | 0.05057441238
| table 4 - flush w/o rate-limiting|
#-run | (pre-change) avg micros/op | std micros/op | (post-change) avg micros/op | std micros/op | change in avg micros/op (%)
-- | -- | -- | -- | -- | --
10 | 477283 | 24719.6 | 473864 | 12379 | -0.7163464863
20 | 486743 | 20175.2 | 502296 | 23931.3 | 3.195320734
40 | 482846 | 15309.2 | 489820 | 22259.5 | 1.444352858
80 | 491490 | 21883.1 | 490071 | 23085.7 | -0.2887139108
160 | 493347 | 28074.3 | 483609 | 21211.7 | -1.973864238
**3.20E+02** | **487512** | **21401.5** | **485856** | **22195.2** | **-0.3396839462**
6.40E+02 | 490307 | 25418.6 | 485435 | 22405.2 | -0.9936631539
average over #-run | 4.87E+05 | 2.24E+04 | 4.87E+05 | 2.11E+04 | 0.00E+00
Reviewed By: ajkr
Differential Revision: D34442441
Pulled By: hx235
fbshipit-source-id: 4790f13e1e5c0a95ae1d1cc93ffcf69dc6e78bdd
2022-03-08 21:19:39 +00:00
|
|
|
if (FLAGS_rate_limit_auto_wal_flush) {
|
|
|
|
wo.rate_limiter_priority = Env::IO_USER;
|
|
|
|
}
|
2019-12-25 02:45:11 +00:00
|
|
|
Status s = NewTxn(wo, &txn);
|
|
|
|
if (!s.ok()) {
|
2023-11-03 16:53:22 +00:00
|
|
|
fprintf(stderr, "NewTxn error: %s\n", s.ToString().c_str());
|
2024-05-24 21:24:17 +00:00
|
|
|
shared->SafeTerminate();
|
2019-12-25 02:45:11 +00:00
|
|
|
}
|
2019-12-21 07:10:58 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
2023-05-22 19:31:52 +00:00
|
|
|
uint64_t rand_key = rand_keys[i];
|
|
|
|
key_str.emplace_back(Key(rand_key));
|
2019-12-09 07:49:32 +00:00
|
|
|
keys.emplace_back(key_str.back());
|
2019-12-25 02:45:11 +00:00
|
|
|
if (use_txn) {
|
2024-05-24 21:24:17 +00:00
|
|
|
MaybeAddKeyToTxnForRYW(thread, column_family, rand_key, txn.get(),
|
|
|
|
ryw_expected_values);
|
2019-12-21 07:10:58 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
2019-12-21 07:10:58 +00:00
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
int injected_error_count = 0;
|
|
|
|
|
2019-12-25 02:45:11 +00:00
|
|
|
if (!use_txn) {
|
2020-04-11 00:18:56 +00:00
|
|
|
if (fault_fs_guard) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
2024-06-22 03:11:57 +00:00
|
|
|
SharedState::ignore_read_error = false;
|
2020-04-11 00:18:56 +00:00
|
|
|
}
|
2020-05-19 19:41:43 +00:00
|
|
|
db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
|
2019-12-21 07:10:58 +00:00
|
|
|
statuses.data());
|
2024-06-24 04:54:27 +00:00
|
|
|
if (fault_fs_guard) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
injected_error_count = GetMinInjectedErrorCount(
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead),
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead));
|
|
|
|
|
|
|
|
if (injected_error_count > 0) {
|
|
|
|
int stat_nok_nfound = 0;
|
|
|
|
for (const auto& s : statuses) {
|
|
|
|
if (!s.ok() && !s.IsNotFound()) {
|
|
|
|
stat_nok_nfound++;
|
|
|
|
}
|
|
|
|
}
|
2024-06-24 04:54:27 +00:00
|
|
|
if (!SharedState::ignore_read_error &&
|
|
|
|
stat_nok_nfound < injected_error_count) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Grab mutex so multiple thread don't try to print the
|
|
|
|
// stack trace at the same time
|
|
|
|
MutexLock l(shared->GetMutex());
|
|
|
|
fprintf(stderr, "Didn't get expected error from MultiGet. \n");
|
|
|
|
fprintf(stderr,
|
|
|
|
"num_keys %zu Expected %d errors, seen at least %d\n",
|
|
|
|
num_keys, injected_error_count, stat_nok_nfound);
|
|
|
|
fprintf(stderr, "Callstack that injected the fault\n");
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
std::terminate();
|
|
|
|
}
|
|
|
|
}
|
2020-04-11 00:18:56 +00:00
|
|
|
}
|
2019-12-21 07:10:58 +00:00
|
|
|
} else {
|
2023-05-22 19:31:52 +00:00
|
|
|
assert(txn);
|
2020-05-19 19:41:43 +00:00
|
|
|
txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
|
2019-12-21 07:10:58 +00:00
|
|
|
statuses.data());
|
|
|
|
}
|
|
|
|
|
2023-05-22 19:31:52 +00:00
|
|
|
auto ryw_check =
|
|
|
|
[](const Slice& key, const PinnableSlice& value, const Status& s,
|
|
|
|
const std::optional<ExpectedValue>& ryw_expected_value) -> bool {
|
|
|
|
if (!ryw_expected_value.has_value()) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
const ExpectedValue& expected = ryw_expected_value.value();
|
|
|
|
char expected_value[100];
|
|
|
|
if (s.ok() &&
|
|
|
|
ExpectedValueHelper::MustHaveNotExisted(expected, expected)) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGet returned value different from what was "
|
|
|
|
"written for key %s\n",
|
|
|
|
key.ToString(true).c_str());
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGet returned ok, transaction has non-committed "
|
|
|
|
"delete.\n");
|
|
|
|
return false;
|
|
|
|
} else if (s.IsNotFound() &&
|
|
|
|
ExpectedValueHelper::MustHaveExisted(expected, expected)) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGet returned value different from what was "
|
|
|
|
"written for key %s\n",
|
|
|
|
key.ToString(true).c_str());
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGet returned not found, transaction has "
|
|
|
|
"non-committed value.\n");
|
|
|
|
return false;
|
|
|
|
} else if (s.ok() &&
|
|
|
|
ExpectedValueHelper::MustHaveExisted(expected, expected)) {
|
|
|
|
Slice from_txn_slice(value);
|
|
|
|
size_t sz = GenerateValue(expected.GetValueBase(), expected_value,
|
|
|
|
sizeof(expected_value));
|
|
|
|
Slice expected_value_slice(expected_value, sz);
|
|
|
|
if (expected_value_slice.compare(from_txn_slice) == 0) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGet returned value different from what was "
|
|
|
|
"written for key %s\n",
|
|
|
|
key.ToString(true /* hex */).c_str());
|
|
|
|
fprintf(stderr, "MultiGet returned value %s\n",
|
|
|
|
from_txn_slice.ToString(true /* hex */).c_str());
|
|
|
|
fprintf(stderr, "Transaction has non-committed value %s\n",
|
|
|
|
expected_value_slice.ToString(true /* hex */).c_str());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
auto check_multiget =
|
|
|
|
[&](const Slice& key, const PinnableSlice& expected_value,
|
|
|
|
const Status& s,
|
|
|
|
const std::optional<ExpectedValue>& ryw_expected_value) -> bool {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Temporarily disable error injection for verification
|
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool check_multiget_res = true;
|
2020-05-19 19:41:43 +00:00
|
|
|
bool is_consistent = true;
|
2023-05-22 19:31:52 +00:00
|
|
|
bool is_ryw_correct = true;
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
|
|
|
|
// If test does not use transaction, the consistency check for each key
|
|
|
|
// included check results from db `Get` and db `MultiGet` are consistent.
|
2023-05-22 19:31:52 +00:00
|
|
|
// If test use transaction, after consistency check, also do a read your
|
|
|
|
// own write check.
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
Status tmp_s;
|
|
|
|
std::string value;
|
2020-05-19 19:41:43 +00:00
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
if (use_txn) {
|
|
|
|
assert(txn);
|
|
|
|
ThreadStatusUtil::SetThreadOperation(
|
|
|
|
ThreadStatus::OperationType::OP_GET);
|
|
|
|
tmp_s = txn->Get(readoptionscopy, cfh, key, &value);
|
|
|
|
ThreadStatusUtil::SetThreadOperation(
|
|
|
|
ThreadStatus::OperationType::OP_MULTIGET);
|
|
|
|
} else {
|
|
|
|
ThreadStatusUtil::SetThreadOperation(
|
|
|
|
ThreadStatus::OperationType::OP_GET);
|
|
|
|
tmp_s = db_->Get(readoptionscopy, cfh, key, &value);
|
|
|
|
ThreadStatusUtil::SetThreadOperation(
|
|
|
|
ThreadStatus::OperationType::OP_MULTIGET);
|
|
|
|
}
|
|
|
|
if (!tmp_s.ok() && !tmp_s.IsNotFound()) {
|
|
|
|
fprintf(stderr, "Get error: %s\n", s.ToString().c_str());
|
|
|
|
is_consistent = false;
|
|
|
|
} else if (!s.ok() && tmp_s.ok()) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGet(%d) returned different results with key %s. "
|
|
|
|
"Snapshot Seq No: %" PRIu64 "\n",
|
|
|
|
column_family, key.ToString(true).c_str(),
|
|
|
|
readoptionscopy.snapshot->GetSequenceNumber());
|
|
|
|
fprintf(stderr, "Get returned ok, MultiGet returned not found\n");
|
|
|
|
is_consistent = false;
|
|
|
|
} else if (s.ok() && tmp_s.IsNotFound()) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGet(%d) returned different results with key %s. "
|
|
|
|
"Snapshot Seq No: %" PRIu64 "\n",
|
|
|
|
column_family, key.ToString(true).c_str(),
|
|
|
|
readoptionscopy.snapshot->GetSequenceNumber());
|
|
|
|
fprintf(stderr, "MultiGet returned ok, Get returned not found\n");
|
|
|
|
is_consistent = false;
|
|
|
|
} else if (s.ok() && value != expected_value.ToString()) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGet(%d) returned different results with key %s. "
|
|
|
|
"Snapshot Seq No: %" PRIu64 "\n",
|
|
|
|
column_family, key.ToString(true).c_str(),
|
|
|
|
readoptionscopy.snapshot->GetSequenceNumber());
|
|
|
|
fprintf(stderr, "MultiGet returned value %s\n",
|
|
|
|
expected_value.ToString(true).c_str());
|
|
|
|
fprintf(stderr, "Get returned value %s\n",
|
|
|
|
Slice(value).ToString(true /* hex */).c_str());
|
|
|
|
is_consistent = false;
|
2020-05-19 19:41:43 +00:00
|
|
|
}
|
|
|
|
|
2023-05-22 19:31:52 +00:00
|
|
|
// If test uses transaction, continue to do a read your own write check.
|
|
|
|
if (is_consistent && use_txn) {
|
|
|
|
is_ryw_correct = ryw_check(key, expected_value, s, ryw_expected_value);
|
|
|
|
}
|
|
|
|
|
2020-05-19 19:41:43 +00:00
|
|
|
if (!is_consistent) {
|
|
|
|
fprintf(stderr, "TestMultiGet error: is_consistent is false\n");
|
|
|
|
thread->stats.AddErrors(1);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
check_multiget_res = false;
|
2020-05-19 19:41:43 +00:00
|
|
|
// Fail fast to preserve the DB state
|
2024-05-24 21:24:17 +00:00
|
|
|
shared->SetVerificationFailure();
|
2023-05-22 19:31:52 +00:00
|
|
|
} else if (!is_ryw_correct) {
|
|
|
|
fprintf(stderr, "TestMultiGet error: is_ryw_correct is false\n");
|
|
|
|
thread->stats.AddErrors(1);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
check_multiget_res = false;
|
2023-05-22 19:31:52 +00:00
|
|
|
// Fail fast to preserve the DB state
|
2024-05-24 21:24:17 +00:00
|
|
|
shared->SetVerificationFailure();
|
2020-05-19 19:41:43 +00:00
|
|
|
} else if (s.ok()) {
|
2020-04-14 18:04:39 +00:00
|
|
|
// found case
|
|
|
|
thread->stats.AddGets(1, 1);
|
2019-12-09 07:49:32 +00:00
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
// not found case
|
|
|
|
thread->stats.AddGets(1, 0);
|
2020-01-03 21:05:21 +00:00
|
|
|
} else if (s.IsMergeInProgress() && use_txn) {
|
|
|
|
// With txn this is sometimes expected.
|
|
|
|
thread->stats.AddGets(1, 1);
|
2024-06-25 03:51:39 +00:00
|
|
|
} else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
|
|
|
|
fprintf(stderr, "MultiGet error: %s\n", s.ToString().c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
shared->SetVerificationFailure();
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
|
|
|
|
// Enable back error injection disbled for checking results
|
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
|
|
|
return check_multiget_res;
|
2023-05-22 19:31:52 +00:00
|
|
|
};
|
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Consistency check
|
|
|
|
if (do_consistency_check && injected_error_count == 0) {
|
|
|
|
size_t num_of_keys = keys.size();
|
|
|
|
assert(values.size() == num_of_keys);
|
|
|
|
assert(statuses.size() == num_of_keys);
|
|
|
|
for (size_t i = 0; i < num_of_keys; ++i) {
|
|
|
|
bool check_result = true;
|
|
|
|
if (use_txn) {
|
|
|
|
std::optional<ExpectedValue> ryw_expected_value;
|
2024-05-23 23:47:39 +00:00
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
const auto it = ryw_expected_values.find(key_str[i]);
|
|
|
|
if (it != ryw_expected_values.end()) {
|
|
|
|
ryw_expected_value = it->second;
|
|
|
|
}
|
2024-05-23 23:47:39 +00:00
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
check_result = check_multiget(keys[i], values[i], statuses[i],
|
|
|
|
ryw_expected_value);
|
|
|
|
} else {
|
|
|
|
check_result = check_multiget(keys[i], values[i], statuses[i],
|
|
|
|
std::nullopt /* ryw_expected_value */);
|
|
|
|
}
|
|
|
|
if (!check_result) {
|
|
|
|
break;
|
|
|
|
}
|
2023-05-22 19:31:52 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
2020-05-19 19:41:43 +00:00
|
|
|
|
|
|
|
if (readoptionscopy.snapshot) {
|
|
|
|
db_->ReleaseSnapshot(readoptionscopy.snapshot);
|
2020-04-11 00:18:56 +00:00
|
|
|
}
|
2023-07-31 00:30:01 +00:00
|
|
|
if (use_txn) {
|
|
|
|
txn->Rollback().PermitUncheckedError();
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Enable back error injection disbled for transactions
|
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
2023-07-31 00:30:01 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
return statuses;
|
|
|
|
}
|
|
|
|
|
2023-03-17 21:47:29 +00:00
|
|
|
void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
|
|
|
assert(thread);
|
|
|
|
|
|
|
|
SharedState* const shared = thread->shared;
|
|
|
|
assert(shared);
|
|
|
|
|
|
|
|
assert(!rand_column_families.empty());
|
|
|
|
|
2024-05-30 00:00:11 +00:00
|
|
|
const int column_family = rand_column_families[0];
|
|
|
|
|
|
|
|
assert(column_family >= 0);
|
|
|
|
assert(column_family < static_cast<int>(column_families_.size()));
|
|
|
|
|
|
|
|
ColumnFamilyHandle* const cfh = column_families_[column_family];
|
2023-03-17 21:47:29 +00:00
|
|
|
assert(cfh);
|
|
|
|
|
2024-05-29 17:16:37 +00:00
|
|
|
assert(!rand_keys.empty());
|
|
|
|
|
2024-05-30 00:00:11 +00:00
|
|
|
const int64_t key = rand_keys[0];
|
|
|
|
const std::string key_str = Key(key);
|
2023-03-17 21:47:29 +00:00
|
|
|
|
2024-05-09 23:40:22 +00:00
|
|
|
PinnableWideColumns columns_from_db;
|
|
|
|
PinnableAttributeGroups attribute_groups_from_db;
|
2023-03-17 21:47:29 +00:00
|
|
|
|
2024-01-11 00:35:54 +00:00
|
|
|
ReadOptions read_opts_copy = read_opts;
|
|
|
|
std::string read_ts_str;
|
|
|
|
Slice read_ts_slice;
|
|
|
|
if (FLAGS_user_timestamp_size > 0) {
|
|
|
|
read_ts_str = GetNowNanos();
|
|
|
|
read_ts_slice = read_ts_str;
|
|
|
|
read_opts_copy.timestamp = &read_ts_slice;
|
|
|
|
}
|
2024-05-30 00:00:11 +00:00
|
|
|
const bool read_older_ts = MaybeUseOlderTimestampForPointLookup(
|
2024-01-11 00:35:54 +00:00
|
|
|
thread, read_ts_str, read_ts_slice, read_opts_copy);
|
|
|
|
|
2024-05-30 00:00:11 +00:00
|
|
|
const ExpectedValue pre_read_expected_value =
|
|
|
|
thread->shared->Get(column_family, key);
|
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
2024-06-22 03:11:57 +00:00
|
|
|
SharedState::ignore_read_error = false;
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
}
|
|
|
|
|
2024-05-09 23:40:22 +00:00
|
|
|
Status s;
|
|
|
|
if (FLAGS_use_attribute_group) {
|
|
|
|
attribute_groups_from_db.emplace_back(cfh);
|
2024-05-30 00:00:11 +00:00
|
|
|
s = db_->GetEntity(read_opts_copy, key_str, &attribute_groups_from_db);
|
2024-05-09 23:40:22 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
s = attribute_groups_from_db.back().status();
|
|
|
|
}
|
|
|
|
} else {
|
2024-05-30 00:00:11 +00:00
|
|
|
s = db_->GetEntity(read_opts_copy, cfh, key_str, &columns_from_db);
|
2024-05-09 23:40:22 +00:00
|
|
|
}
|
2023-03-17 21:47:29 +00:00
|
|
|
|
2024-05-30 00:00:11 +00:00
|
|
|
const ExpectedValue post_read_expected_value =
|
|
|
|
thread->shared->Get(column_family, key);
|
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
int injected_error_count = 0;
|
2024-06-24 04:54:27 +00:00
|
|
|
if (fault_fs_guard) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
injected_error_count = GetMinInjectedErrorCount(
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead),
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead));
|
2024-06-24 04:54:27 +00:00
|
|
|
if (!SharedState::ignore_read_error && injected_error_count > 0 &&
|
|
|
|
(s.ok() || s.IsNotFound())) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Grab mutex so multiple thread don't try to print the
|
|
|
|
// stack trace at the same time
|
|
|
|
MutexLock l(thread->shared->GetMutex());
|
|
|
|
fprintf(stderr, "Didn't get expected error from GetEntity\n");
|
|
|
|
fprintf(stderr, "Callstack that injected the fault\n");
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
std::terminate();
|
|
|
|
}
|
2023-03-17 21:47:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
thread->stats.AddGets(1, 1);
|
|
|
|
|
2024-01-22 20:15:17 +00:00
|
|
|
if (!FLAGS_skip_verifydb && !read_older_ts) {
|
2024-05-09 23:40:22 +00:00
|
|
|
if (FLAGS_use_attribute_group) {
|
|
|
|
assert(!attribute_groups_from_db.empty());
|
|
|
|
}
|
|
|
|
const WideColumns& columns =
|
|
|
|
FLAGS_use_attribute_group
|
|
|
|
? attribute_groups_from_db.back().columns()
|
|
|
|
: columns_from_db.columns();
|
2023-03-17 21:47:29 +00:00
|
|
|
if (!VerifyWideColumns(columns)) {
|
|
|
|
shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
|
|
|
"error : inconsistent columns returned by GetEntity for key "
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
"%s (%" PRIi64 "): %s\n",
|
|
|
|
StringToHex(key_str).c_str(), rand_keys[0],
|
2024-05-30 00:00:11 +00:00
|
|
|
WideColumnsToHex(columns).c_str());
|
|
|
|
} else if (ExpectedValueHelper::MustHaveNotExisted(
|
|
|
|
pre_read_expected_value, post_read_expected_value)) {
|
2023-03-17 21:47:29 +00:00
|
|
|
shared->SetVerificationFailure();
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"error : inconsistent values for key %s (%" PRIi64
|
|
|
|
"): GetEntity returns %s, "
|
|
|
|
"expected state does not have the key.\n",
|
|
|
|
StringToHex(key_str).c_str(), rand_keys[0],
|
|
|
|
WideColumnsToHex(columns).c_str());
|
2024-05-30 00:00:11 +00:00
|
|
|
} else {
|
|
|
|
const uint32_t value_base_from_db =
|
|
|
|
GetValueBase(WideColumnsHelper::GetDefaultColumn(columns));
|
|
|
|
if (!ExpectedValueHelper::InExpectedValueBaseRange(
|
|
|
|
value_base_from_db, pre_read_expected_value,
|
|
|
|
post_read_expected_value)) {
|
|
|
|
shared->SetVerificationFailure();
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
"error : inconsistent values for key %s (%" PRIi64
|
|
|
|
"): GetEntity returns %s "
|
2024-05-30 00:00:11 +00:00
|
|
|
"with value base %d that falls out of expected state's value "
|
|
|
|
"base range.\n",
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
StringToHex(key_str).c_str(), rand_keys[0],
|
|
|
|
WideColumnsToHex(columns).c_str(), value_base_from_db);
|
2024-05-30 00:00:11 +00:00
|
|
|
}
|
2023-03-17 21:47:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
thread->stats.AddGets(1, 0);
|
|
|
|
|
2024-01-11 00:35:54 +00:00
|
|
|
if (!FLAGS_skip_verifydb && !read_older_ts) {
|
2024-05-30 00:00:11 +00:00
|
|
|
if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value,
|
|
|
|
post_read_expected_value)) {
|
2023-03-17 21:47:29 +00:00
|
|
|
shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
"error : inconsistent values for key %s (%" PRIi64
|
|
|
|
"): expected state has "
|
2023-03-17 21:47:29 +00:00
|
|
|
"the key, GetEntity returns NotFound.\n",
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
StringToHex(key_str).c_str(), rand_keys[0]);
|
2023-03-17 21:47:29 +00:00
|
|
|
}
|
|
|
|
}
|
2024-06-25 03:51:39 +00:00
|
|
|
} else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"error : GetEntity() returns %s for key: %s (%" PRIi64 ").\n",
|
|
|
|
s.ToString().c_str(), StringToHex(key_str).c_str(), rand_keys[0]);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
thread->shared->SetVerificationFailure();
|
2023-03-17 21:47:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-30 03:35:15 +00:00
|
|
|
void TestMultiGetEntity(ThreadState* thread, const ReadOptions& read_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
|
|
|
assert(thread);
|
|
|
|
|
|
|
|
ManagedSnapshot snapshot_guard(db_);
|
|
|
|
|
|
|
|
ReadOptions read_opts_copy(read_opts);
|
|
|
|
read_opts_copy.snapshot = snapshot_guard.snapshot();
|
|
|
|
|
|
|
|
assert(!rand_column_families.empty());
|
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
const int column_family = rand_column_families[0];
|
|
|
|
|
|
|
|
assert(column_family >= 0);
|
|
|
|
assert(column_family < static_cast<int>(column_families_.size()));
|
|
|
|
|
|
|
|
ColumnFamilyHandle* const cfh = column_families_[column_family];
|
2023-03-30 03:35:15 +00:00
|
|
|
assert(cfh);
|
|
|
|
|
|
|
|
assert(!rand_keys.empty());
|
|
|
|
|
|
|
|
const size_t num_keys = rand_keys.size();
|
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
std::unique_ptr<Transaction> txn;
|
2023-03-30 03:35:15 +00:00
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
if (FLAGS_use_txn) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// TODO(hx235): test fault injection with MultiGetEntity() with
|
|
|
|
// transactions
|
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
2024-05-31 19:01:59 +00:00
|
|
|
WriteOptions write_options;
|
|
|
|
if (FLAGS_rate_limit_auto_wal_flush) {
|
|
|
|
write_options.rate_limiter_priority = Env::IO_USER;
|
|
|
|
}
|
|
|
|
|
|
|
|
const Status s = NewTxn(write_options, &txn);
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr, "NewTxn error: %s\n", s.ToString().c_str());
|
|
|
|
thread->shared->SafeTerminate();
|
|
|
|
}
|
2024-05-14 23:33:44 +00:00
|
|
|
}
|
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
std::vector<std::string> keys(num_keys);
|
|
|
|
std::vector<Slice> key_slices(num_keys);
|
|
|
|
std::unordered_map<std::string, ExpectedValue> ryw_expected_values;
|
|
|
|
|
2023-03-30 03:35:15 +00:00
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
2024-05-31 19:01:59 +00:00
|
|
|
const int64_t key = rand_keys[i];
|
|
|
|
|
|
|
|
keys[i] = Key(key);
|
2023-03-30 03:35:15 +00:00
|
|
|
key_slices[i] = keys[i];
|
2024-05-31 19:01:59 +00:00
|
|
|
|
|
|
|
if (FLAGS_use_txn) {
|
|
|
|
MaybeAddKeyToTxnForRYW(thread, column_family, key, txn.get(),
|
|
|
|
ryw_expected_values);
|
|
|
|
}
|
2023-03-30 03:35:15 +00:00
|
|
|
}
|
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
int injected_error_count = 0;
|
2023-03-30 03:35:15 +00:00
|
|
|
|
2024-05-30 20:31:25 +00:00
|
|
|
auto verify_expected_errors = [&](auto get_status) {
|
|
|
|
assert(fault_fs_guard);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
injected_error_count = GetMinInjectedErrorCount(
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead),
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead));
|
|
|
|
if (injected_error_count) {
|
|
|
|
int stat_nok_nfound = 0;
|
2024-05-30 20:31:25 +00:00
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
const Status& s = get_status(i);
|
|
|
|
if (!s.ok() && !s.IsNotFound()) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
++stat_nok_nfound;
|
2024-05-14 23:33:44 +00:00
|
|
|
}
|
|
|
|
}
|
2023-03-30 03:35:15 +00:00
|
|
|
|
2024-06-24 04:54:27 +00:00
|
|
|
if (!SharedState::ignore_read_error &&
|
|
|
|
stat_nok_nfound < injected_error_count) {
|
2024-05-30 20:31:25 +00:00
|
|
|
// Grab mutex so multiple threads don't try to print the
|
|
|
|
// stack trace at the same time
|
|
|
|
assert(thread->shared);
|
|
|
|
MutexLock l(thread->shared->GetMutex());
|
2023-03-30 03:35:15 +00:00
|
|
|
|
2024-05-30 20:31:25 +00:00
|
|
|
fprintf(stderr, "Didn't get expected error from MultiGetEntity\n");
|
|
|
|
fprintf(stderr, "num_keys %zu Expected %d errors, seen %d\n",
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
num_keys, injected_error_count, stat_nok_nfound);
|
2024-05-30 20:31:25 +00:00
|
|
|
fprintf(stderr, "Call stack that injected the fault\n");
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
2024-05-30 20:31:25 +00:00
|
|
|
std::terminate();
|
2024-05-14 23:33:44 +00:00
|
|
|
}
|
|
|
|
}
|
2024-05-30 20:31:25 +00:00
|
|
|
};
|
2024-05-14 23:33:44 +00:00
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
auto check_results = [&](auto get_columns, auto get_status,
|
|
|
|
auto do_extra_check, auto call_get_entity) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Temporarily disable error injection for checking results
|
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
2024-05-14 23:33:44 +00:00
|
|
|
const bool check_get_entity =
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
!injected_error_count && FLAGS_check_multiget_entity_consistency;
|
2024-05-14 23:33:44 +00:00
|
|
|
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
2024-05-30 20:31:25 +00:00
|
|
|
const WideColumns& columns = get_columns(i);
|
|
|
|
const Status& s = get_status(i);
|
2024-05-14 23:33:44 +00:00
|
|
|
|
|
|
|
bool is_consistent = true;
|
|
|
|
|
2024-05-30 20:31:25 +00:00
|
|
|
if (s.ok() && !VerifyWideColumns(columns)) {
|
2024-05-14 23:33:44 +00:00
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"error : inconsistent columns returned by MultiGetEntity for key "
|
|
|
|
"%s: %s\n",
|
2024-05-30 20:31:25 +00:00
|
|
|
StringToHex(keys[i]).c_str(), WideColumnsToHex(columns).c_str());
|
2024-05-14 23:33:44 +00:00
|
|
|
is_consistent = false;
|
2024-05-31 19:01:59 +00:00
|
|
|
} else if (s.ok() || s.IsNotFound()) {
|
|
|
|
if (!do_extra_check(keys[i], columns, s)) {
|
2023-03-30 03:35:15 +00:00
|
|
|
is_consistent = false;
|
2024-05-31 19:01:59 +00:00
|
|
|
} else if (check_get_entity) {
|
|
|
|
PinnableWideColumns cmp_result;
|
|
|
|
ThreadStatusUtil::SetThreadOperation(
|
|
|
|
ThreadStatus::OperationType::OP_GETENTITY);
|
|
|
|
const Status cmp_s = call_get_entity(key_slices[i], &cmp_result);
|
|
|
|
|
|
|
|
if (!cmp_s.ok() && !cmp_s.IsNotFound()) {
|
|
|
|
fprintf(stderr, "GetEntity error: %s\n",
|
|
|
|
cmp_s.ToString().c_str());
|
2023-03-30 03:35:15 +00:00
|
|
|
is_consistent = false;
|
2024-05-31 19:01:59 +00:00
|
|
|
} else if (cmp_s.IsNotFound()) {
|
|
|
|
if (s.ok()) {
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"Inconsistent results for key %s: MultiGetEntity returned "
|
|
|
|
"ok, GetEntity returned not found\n",
|
|
|
|
StringToHex(keys[i]).c_str());
|
|
|
|
is_consistent = false;
|
|
|
|
}
|
2024-05-14 23:33:44 +00:00
|
|
|
} else {
|
2024-05-31 19:01:59 +00:00
|
|
|
assert(cmp_s.ok());
|
2024-05-14 23:33:44 +00:00
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
if (s.IsNotFound()) {
|
2024-05-14 23:33:44 +00:00
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"Inconsistent results for key %s: MultiGetEntity returned "
|
2024-05-31 19:01:59 +00:00
|
|
|
"not found, GetEntity returned ok\n",
|
|
|
|
StringToHex(keys[i]).c_str());
|
2024-05-14 23:33:44 +00:00
|
|
|
is_consistent = false;
|
2024-05-31 19:01:59 +00:00
|
|
|
} else {
|
|
|
|
assert(s.ok());
|
|
|
|
|
|
|
|
const WideColumns& cmp_columns = cmp_result.columns();
|
|
|
|
|
|
|
|
if (columns != cmp_columns) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Inconsistent results for key %s: MultiGetEntity "
|
|
|
|
"returned "
|
|
|
|
"%s, GetEntity returned %s\n",
|
|
|
|
StringToHex(keys[i]).c_str(),
|
|
|
|
WideColumnsToHex(columns).c_str(),
|
|
|
|
WideColumnsToHex(cmp_columns).c_str());
|
|
|
|
is_consistent = false;
|
|
|
|
}
|
2024-05-14 23:33:44 +00:00
|
|
|
}
|
2023-03-30 03:35:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-05-30 20:31:25 +00:00
|
|
|
|
2024-05-14 23:33:44 +00:00
|
|
|
if (!is_consistent) {
|
2024-05-30 20:31:25 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"TestMultiGetEntity error: results are not consistent\n");
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
// Fail fast to preserve the DB state
|
|
|
|
thread->shared->SetVerificationFailure();
|
2024-05-14 23:33:44 +00:00
|
|
|
break;
|
2024-05-30 20:31:25 +00:00
|
|
|
} else if (s.ok()) {
|
|
|
|
thread->stats.AddGets(1, 1);
|
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
thread->stats.AddGets(1, 0);
|
2024-06-25 03:51:39 +00:00
|
|
|
} else if (injected_error_count == 0 ||
|
|
|
|
!IsErrorInjectedAndRetryable(s)) {
|
|
|
|
fprintf(stderr, "MultiGetEntity error: %s\n", s.ToString().c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
thread->shared->SetVerificationFailure();
|
2023-03-30 03:35:15 +00:00
|
|
|
}
|
|
|
|
}
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Enable back error injection disbled for checking results
|
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
2024-05-30 20:31:25 +00:00
|
|
|
};
|
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
if (FLAGS_use_txn) {
|
|
|
|
// Transactional/read-your-own-writes MultiGetEntity verification
|
|
|
|
std::vector<PinnableWideColumns> results(num_keys);
|
|
|
|
std::vector<Status> statuses(num_keys);
|
|
|
|
|
|
|
|
assert(txn);
|
|
|
|
txn->MultiGetEntity(read_opts_copy, cfh, num_keys, key_slices.data(),
|
|
|
|
results.data(), statuses.data());
|
|
|
|
|
|
|
|
auto ryw_check = [&](const std::string& key, const WideColumns& columns,
|
|
|
|
const Status& s) -> bool {
|
|
|
|
const auto it = ryw_expected_values.find(key);
|
|
|
|
if (it == ryw_expected_values.end()) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
const auto& ryw_expected_value = it->second;
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
if (ryw_expected_value.IsDeleted()) {
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"MultiGetEntity failed the read-your-own-write check for key "
|
|
|
|
"%s\n",
|
|
|
|
Slice(key).ToString(true).c_str());
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGetEntity returned ok, transaction has non-committed "
|
|
|
|
"delete\n");
|
|
|
|
return false;
|
|
|
|
} else {
|
|
|
|
const uint32_t value_base = ryw_expected_value.GetValueBase();
|
|
|
|
char expected_value[100];
|
|
|
|
const size_t sz = GenerateValue(value_base, expected_value,
|
|
|
|
sizeof(expected_value));
|
|
|
|
const Slice expected_slice(expected_value, sz);
|
|
|
|
const WideColumns expected_columns =
|
|
|
|
GenerateExpectedWideColumns(value_base, expected_slice);
|
|
|
|
|
|
|
|
if (columns != expected_columns) {
|
|
|
|
fprintf(
|
|
|
|
stderr,
|
|
|
|
"MultiGetEntity failed the read-your-own-write check for key "
|
|
|
|
"%s\n",
|
|
|
|
Slice(key).ToString(true).c_str());
|
|
|
|
fprintf(stderr, "MultiGetEntity returned %s\n",
|
|
|
|
WideColumnsToHex(columns).c_str());
|
|
|
|
fprintf(stderr, "Transaction has non-committed write %s\n",
|
|
|
|
WideColumnsToHex(expected_columns).c_str());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(s.IsNotFound());
|
|
|
|
if (!ryw_expected_value.IsDeleted()) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGetEntity failed the read-your-own-write check for key "
|
|
|
|
"%s\n",
|
|
|
|
Slice(key).ToString(true).c_str());
|
|
|
|
fprintf(stderr,
|
|
|
|
"MultiGetEntity returned not found, transaction has "
|
|
|
|
"non-committed write\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
check_results([&](size_t i) { return results[i].columns(); },
|
|
|
|
[&](size_t i) { return statuses[i]; }, ryw_check,
|
|
|
|
[&](const Slice& key, PinnableWideColumns* result) {
|
|
|
|
return txn->GetEntity(read_opts_copy, cfh, key, result);
|
|
|
|
});
|
|
|
|
|
|
|
|
txn->Rollback().PermitUncheckedError();
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Enable back error injection disbled for transactions
|
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
2024-05-31 19:01:59 +00:00
|
|
|
} else if (FLAGS_use_attribute_group) {
|
2024-05-30 20:31:25 +00:00
|
|
|
// AttributeGroup MultiGetEntity verification
|
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
if (fault_fs_guard) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
2024-06-22 03:11:57 +00:00
|
|
|
SharedState::ignore_read_error = false;
|
2024-05-31 19:01:59 +00:00
|
|
|
}
|
|
|
|
|
2024-05-30 20:31:25 +00:00
|
|
|
std::vector<PinnableAttributeGroups> results;
|
|
|
|
results.reserve(num_keys);
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
PinnableAttributeGroups attribute_groups;
|
|
|
|
attribute_groups.emplace_back(cfh);
|
|
|
|
results.emplace_back(std::move(attribute_groups));
|
|
|
|
}
|
|
|
|
|
|
|
|
db_->MultiGetEntity(read_opts_copy, num_keys, key_slices.data(),
|
|
|
|
results.data());
|
|
|
|
|
2024-06-24 04:54:27 +00:00
|
|
|
if (fault_fs_guard) {
|
2024-05-30 20:31:25 +00:00
|
|
|
verify_expected_errors(
|
|
|
|
[&](size_t i) { return results[i][0].status(); });
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compare against non-attribute-group GetEntity result
|
|
|
|
check_results([&](size_t i) { return results[i][0].columns(); },
|
2024-05-31 19:01:59 +00:00
|
|
|
[&](size_t i) { return results[i][0].status(); },
|
|
|
|
[](const Slice& /* key */, const WideColumns& /* columns */,
|
|
|
|
const Status& /* s */) { return true; },
|
|
|
|
[&](const Slice& key, PinnableWideColumns* result) {
|
|
|
|
return db_->GetEntity(read_opts_copy, cfh, key, result);
|
|
|
|
});
|
2024-05-30 20:31:25 +00:00
|
|
|
} else {
|
|
|
|
// Non-AttributeGroup MultiGetEntity verification
|
|
|
|
|
2024-05-31 19:01:59 +00:00
|
|
|
if (fault_fs_guard) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
2024-06-22 03:11:57 +00:00
|
|
|
SharedState::ignore_read_error = false;
|
2024-05-31 19:01:59 +00:00
|
|
|
}
|
|
|
|
|
2024-05-30 20:31:25 +00:00
|
|
|
std::vector<PinnableWideColumns> results(num_keys);
|
|
|
|
std::vector<Status> statuses(num_keys);
|
|
|
|
|
|
|
|
db_->MultiGetEntity(read_opts_copy, cfh, num_keys, key_slices.data(),
|
|
|
|
results.data(), statuses.data());
|
|
|
|
|
2024-06-24 04:54:27 +00:00
|
|
|
if (fault_fs_guard) {
|
2024-05-30 20:31:25 +00:00
|
|
|
verify_expected_errors([&](size_t i) { return statuses[i]; });
|
|
|
|
}
|
|
|
|
|
|
|
|
check_results([&](size_t i) { return results[i].columns(); },
|
2024-05-31 19:01:59 +00:00
|
|
|
[&](size_t i) { return statuses[i]; },
|
|
|
|
[](const Slice& /* key */, const WideColumns& /* columns */,
|
|
|
|
const Status& /* s */) { return true; },
|
|
|
|
[&](const Slice& key, PinnableWideColumns* result) {
|
|
|
|
return db_->GetEntity(read_opts_copy, cfh, key, result);
|
|
|
|
});
|
2023-03-30 03:35:15 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
2022-10-07 18:17:57 +00:00
|
|
|
assert(!rand_column_families.empty());
|
|
|
|
assert(!rand_keys.empty());
|
|
|
|
|
|
|
|
ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]];
|
|
|
|
assert(cfh);
|
|
|
|
|
|
|
|
const std::string key = Key(rand_keys[0]);
|
|
|
|
const Slice prefix(key.data(), FLAGS_prefix_size);
|
2019-12-09 07:49:32 +00:00
|
|
|
|
|
|
|
std::string upper_bound;
|
|
|
|
Slice ub_slice;
|
|
|
|
ReadOptions ro_copy = read_opts;
|
2022-10-07 18:17:57 +00:00
|
|
|
|
2024-10-17 22:52:55 +00:00
|
|
|
// Randomly test with `iterate_upper_bound` and `prefix_same_as_start`
|
|
|
|
//
|
|
|
|
// Get the next prefix first and then see if we want to set it to be the
|
|
|
|
// upper bound. We'll use the next prefix in an assertion later on
|
2020-01-10 05:25:40 +00:00
|
|
|
if (GetNextPrefix(prefix, &upper_bound) && thread->rand.OneIn(2)) {
|
2019-12-09 07:49:32 +00:00
|
|
|
// For half of the time, set the upper bound to the next prefix
|
|
|
|
ub_slice = Slice(upper_bound);
|
|
|
|
ro_copy.iterate_upper_bound = &ub_slice;
|
2024-06-18 23:16:09 +00:00
|
|
|
if (FLAGS_use_sqfc_for_range_queries) {
|
|
|
|
ro_copy.table_filter =
|
|
|
|
sqfc_factory_->GetTableFilterForRangeQuery(prefix, ub_slice);
|
|
|
|
}
|
2024-10-17 22:52:55 +00:00
|
|
|
} else if (options_.prefix_extractor && thread->rand.OneIn(2)) {
|
|
|
|
ro_copy.prefix_same_as_start = true;
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
|
2022-07-05 20:30:15 +00:00
|
|
|
std::string read_ts_str;
|
|
|
|
Slice read_ts_slice;
|
|
|
|
MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice,
|
|
|
|
ro_copy);
|
|
|
|
|
2022-12-15 23:48:50 +00:00
|
|
|
if (fault_fs_guard) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
2024-06-22 03:11:57 +00:00
|
|
|
SharedState::ignore_read_error = false;
|
2022-12-15 23:48:50 +00:00
|
|
|
}
|
|
|
|
|
2024-06-24 04:54:27 +00:00
|
|
|
std::unique_ptr<Iterator> iter(db_->NewIterator(ro_copy, cfh));
|
|
|
|
|
|
|
|
uint64_t count = 0;
|
|
|
|
Status s;
|
|
|
|
|
2024-10-17 22:52:55 +00:00
|
|
|
for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
|
|
|
|
// If upper or prefix bounds is specified, only keys of the target
|
|
|
|
// prefix should show up. Otherwise, we need to manual exit the loop when
|
|
|
|
// we see the first key that is not in the target prefix show up.
|
|
|
|
if (ro_copy.iterate_upper_bound != nullptr ||
|
|
|
|
ro_copy.prefix_same_as_start) {
|
|
|
|
assert(iter->key().starts_with(prefix));
|
|
|
|
} else if (!iter->key().starts_with(prefix)) {
|
|
|
|
break;
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
++count;
|
2022-10-07 18:17:57 +00:00
|
|
|
|
2022-10-10 22:07:07 +00:00
|
|
|
// When iter_start_ts is set, iterator exposes internal keys, including
|
|
|
|
// tombstones; however, we want to perform column validation only for
|
|
|
|
// value-like types.
|
|
|
|
if (ro_copy.iter_start_ts) {
|
|
|
|
const ValueType value_type = ExtractValueType(iter->key());
|
|
|
|
if (value_type != kTypeValue && value_type != kTypeBlobIndex &&
|
|
|
|
value_type != kTypeWideColumnEntity) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-11-08 05:24:21 +00:00
|
|
|
if (ro_copy.allow_unprepared_value) {
|
|
|
|
if (!iter->PrepareValue()) {
|
|
|
|
s = iter->status();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-17 21:47:29 +00:00
|
|
|
if (!VerifyWideColumns(iter->value(), iter->columns())) {
|
|
|
|
s = Status::Corruption("Value and columns inconsistent",
|
|
|
|
DebugString(iter->value(), iter->columns()));
|
2022-10-07 18:17:57 +00:00
|
|
|
break;
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
2020-01-10 05:25:40 +00:00
|
|
|
|
2022-07-05 20:30:15 +00:00
|
|
|
if (ro_copy.iter_start_ts == nullptr) {
|
|
|
|
assert(count <= GetPrefixKeyCount(prefix.ToString(), upper_bound));
|
|
|
|
}
|
2020-01-10 05:25:40 +00:00
|
|
|
|
2022-10-07 18:17:57 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
s = iter->status();
|
|
|
|
}
|
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
int injected_error_count = 0;
|
2024-06-24 04:54:27 +00:00
|
|
|
if (fault_fs_guard) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
injected_error_count = GetMinInjectedErrorCount(
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kRead),
|
|
|
|
fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
|
|
|
|
FaultInjectionIOType::kMetadataRead));
|
2024-06-24 04:54:27 +00:00
|
|
|
if (!SharedState::ignore_read_error && injected_error_count > 0 &&
|
|
|
|
s.ok()) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Grab mutex so multiple thread don't try to print the
|
|
|
|
// stack trace at the same time
|
|
|
|
MutexLock l(thread->shared->GetMutex());
|
|
|
|
fprintf(stderr, "Didn't get expected error from PrefixScan\n");
|
|
|
|
fprintf(stderr, "Callstack that injected the fault\n");
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
std::terminate();
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
2022-10-07 18:17:57 +00:00
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
thread->stats.AddPrefixes(1, count);
|
2024-06-25 03:51:39 +00:00
|
|
|
} else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
|
2024-10-17 22:52:55 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"TestPrefixScan error: %s with ReadOptions::iterate_upper_bound: "
|
|
|
|
"%s, prefix_same_as_start: %s \n",
|
|
|
|
s.ToString().c_str(),
|
|
|
|
ro_copy.iterate_upper_bound
|
|
|
|
? ro_copy.iterate_upper_bound->ToString(true).c_str()
|
|
|
|
: "nullptr",
|
|
|
|
ro_copy.prefix_same_as_start ? "true" : "false");
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
thread->shared->SetVerificationFailure();
|
2022-12-15 23:48:50 +00:00
|
|
|
}
|
2022-10-07 18:17:57 +00:00
|
|
|
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return s;
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
Status TestPut(ThreadState* thread, WriteOptions& write_opts,
|
|
|
|
const ReadOptions& read_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
2022-09-15 22:55:37 +00:00
|
|
|
const std::vector<int64_t>& rand_keys,
|
|
|
|
char (&value)[100]) override {
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
assert(!rand_column_families.empty());
|
|
|
|
assert(!rand_keys.empty());
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
auto shared = thread->shared;
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
assert(shared);
|
|
|
|
|
|
|
|
const int64_t max_key = shared->GetMaxKey();
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
int64_t rand_key = rand_keys[0];
|
|
|
|
int rand_column_family = rand_column_families[0];
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
std::string write_ts;
|
|
|
|
|
2022-09-15 22:55:37 +00:00
|
|
|
std::unique_ptr<MutexLock> lock(
|
|
|
|
new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
|
2019-12-09 07:49:32 +00:00
|
|
|
while (!shared->AllowsOverwrite(rand_key) &&
|
|
|
|
(FLAGS_use_merge || shared->Exists(rand_column_family, rand_key))) {
|
|
|
|
lock.reset();
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
rand_key = thread->rand.Next() % max_key;
|
|
|
|
rand_column_family = thread->rand.Next() % FLAGS_column_families;
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
lock.reset(
|
|
|
|
new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
|
Add user-defined timestamps to db_stress (#8061)
Summary:
Add some basic test for user-defined timestamp to db_stress. Currently,
read with timestamp always tries to read using the current timestamp.
Due to the per-key timestamp-sequence ordering constraint, we only add timestamp-
related tests to the `NonBatchedOpsStressTest` since this test serializes accesses
to the same key and uses a file to cross-check data correctness.
The timestamp feature is not supported in a number of components, e.g. Merge, SingleDelete,
DeleteRange, CompactionFilter, Readonly instance, secondary instance, SST file ingestion, transaction,
etc. Therefore, db_stress should exit if user enables both timestamp and these features at the same
time. The (currently) incompatible features can be found in
`CheckAndSetOptionsForUserTimestamp`.
This PR also fixes a bug triggered when timestamp is enabled together with
`index_type=kBinarySearchWithFirstKey`. This bug fix will also be in another separate PR
with more unit tests coverage. Fixing it here because I do not want to exclude the index type
from crash test.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8061
Test Plan: make crash_test_with_ts
Reviewed By: jay-zhuang
Differential Revision: D27056282
Pulled By: riversand963
fbshipit-source-id: c3e00ad1023fdb9ebbdf9601ec18270c5e2925a9
2021-03-23 12:12:04 +00:00
|
|
|
if (FLAGS_user_timestamp_size > 0) {
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
write_ts = GetNowNanos();
|
Add user-defined timestamps to db_stress (#8061)
Summary:
Add some basic test for user-defined timestamp to db_stress. Currently,
read with timestamp always tries to read using the current timestamp.
Due to the per-key timestamp-sequence ordering constraint, we only add timestamp-
related tests to the `NonBatchedOpsStressTest` since this test serializes accesses
to the same key and uses a file to cross-check data correctness.
The timestamp feature is not supported in a number of components, e.g. Merge, SingleDelete,
DeleteRange, CompactionFilter, Readonly instance, secondary instance, SST file ingestion, transaction,
etc. Therefore, db_stress should exit if user enables both timestamp and these features at the same
time. The (currently) incompatible features can be found in
`CheckAndSetOptionsForUserTimestamp`.
This PR also fixes a bug triggered when timestamp is enabled together with
`index_type=kBinarySearchWithFirstKey`. This bug fix will also be in another separate PR
with more unit tests coverage. Fixing it here because I do not want to exclude the index type
from crash test.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8061
Test Plan: make crash_test_with_ts
Reviewed By: jay-zhuang
Differential Revision: D27056282
Pulled By: riversand963
fbshipit-source-id: c3e00ad1023fdb9ebbdf9601ec18270c5e2925a9
2021-03-23 12:12:04 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
|
|
|
|
if (write_ts.empty() && FLAGS_user_timestamp_size) {
|
|
|
|
write_ts = GetNowNanos();
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
const std::string k = Key(rand_key);
|
|
|
|
|
|
|
|
ColumnFamilyHandle* const cfh = column_families_[rand_column_family];
|
|
|
|
assert(cfh);
|
2019-12-09 07:49:32 +00:00
|
|
|
|
|
|
|
if (FLAGS_verify_before_write) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
// Temporarily disable error injection for preparation
|
Fix nullptr access and race to fault_fs_guard (#12799)
Summary:
**Context/Summary:**
There are a couple places where we forgot to check fault_fs_guard before accessing it. So we can see something like this occasionally
```
=138831==Hint: address points to the zero page.
SCARINESS: 10 (null-deref)
AddressSanitizer:DEADLYSIGNAL
#0 0x18b9e0b in rocksdb::ThreadLocalPtr::Get() const fbcode/internal_repo_rocksdb/repo/util/thread_local.cc:503
https://github.com/facebook/rocksdb/issues/1 0x83d8b7 in rocksdb::StressTest::TestCompactRange(rocksdb::ThreadState*, long, rocksdb::Slice const&, rocksdb::ColumnFamilyHandle*) fbcode/internal_repo_rocksdb/repo/utilities/fault_injection_fs.h
```
Also accessing of `io_activties_exempted_from_fault_injection.find` not fully synced so we see the following
```
WARNING: ThreadSanitizer: data race (pid=90939)
Write of size 8 at 0x7b4c000004d0 by thread T762 (mutexes: write M0):
#0 std::_Rb_tree<rocksdb::Env::IOActivity, rocksdb::Env::IOActivity, std::_Identity<rocksdb::Env::IOActivity>, std::less<rocksdb::Env::IOActivity>, std::allocator<rocksdb::Env::IOActivity>>::operator=(std::_Rb_tree<rocksdb::Env::IOActivity, rocksdb::Env::IOActivity, std::_Identity<rocksdb::Env::IOActivity>, std::less<rocksdb::Env::IOActivity>, std::allocator<rocksdb::Env::IOActivity>> const&) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_tree.h:208 (db_stress+0x411c32) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
https://github.com/facebook/rocksdb/issues/1 rocksdb::DbStressListener::OnErrorRecoveryCompleted(rocksdb::Status) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_set.h:298 (db_stress+0x4112e5) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
https://github.com/facebook/rocksdb/issues/2 rocksdb::EventHelpers::NotifyOnErrorRecoveryEnd(std::vector<std::shared_ptr<rocksdb::EventListener>, std::allocator<std::shared_ptr<rocksdb::EventListener>>> const&, rocksdb::Status const&, rocksdb::Status const&, rocksdb::InstrumentedMutex*) fbcode/internal_repo_rocksdb/repo/db/event_helpers.cc:239 (db_stress+0xa09d60) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
Previous read of size 8 at 0x7b4c000004d0 by thread T131 (mutexes: write M1):
#0 rocksdb::FaultInjectionTestFS::MaybeInjectThreadLocalError(rocksdb::FaultInjectionIOType, rocksdb::IOOptions const&, rocksdb::FaultInjectionTestFS::ErrorOperation, rocksdb::Slice*, bool, char*, bool, bool*) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_tree.h:798 (db_stress+0xf7d0f3) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12799
Test Plan: CI
Reviewed By: jowlyzhang
Differential Revision: D58917449
Pulled By: hx235
fbshipit-source-id: f24fc1acc2a7d91f9f285447a97ba41397f48dbd
2024-06-24 23:10:36 +00:00
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
std::string from_db;
|
|
|
|
Status s = db_->Get(read_opts, cfh, k, &from_db);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
bool res = VerifyOrSyncValue(
|
|
|
|
rand_column_family, rand_key, read_opts, shared,
|
|
|
|
/* msg_prefix */ "Pre-Put Get verification", from_db, s);
|
|
|
|
|
|
|
|
// Enable back error injection disabled for preparation
|
Fix nullptr access and race to fault_fs_guard (#12799)
Summary:
**Context/Summary:**
There are a couple places where we forgot to check fault_fs_guard before accessing it. So we can see something like this occasionally
```
=138831==Hint: address points to the zero page.
SCARINESS: 10 (null-deref)
AddressSanitizer:DEADLYSIGNAL
#0 0x18b9e0b in rocksdb::ThreadLocalPtr::Get() const fbcode/internal_repo_rocksdb/repo/util/thread_local.cc:503
https://github.com/facebook/rocksdb/issues/1 0x83d8b7 in rocksdb::StressTest::TestCompactRange(rocksdb::ThreadState*, long, rocksdb::Slice const&, rocksdb::ColumnFamilyHandle*) fbcode/internal_repo_rocksdb/repo/utilities/fault_injection_fs.h
```
Also accessing of `io_activties_exempted_from_fault_injection.find` not fully synced so we see the following
```
WARNING: ThreadSanitizer: data race (pid=90939)
Write of size 8 at 0x7b4c000004d0 by thread T762 (mutexes: write M0):
#0 std::_Rb_tree<rocksdb::Env::IOActivity, rocksdb::Env::IOActivity, std::_Identity<rocksdb::Env::IOActivity>, std::less<rocksdb::Env::IOActivity>, std::allocator<rocksdb::Env::IOActivity>>::operator=(std::_Rb_tree<rocksdb::Env::IOActivity, rocksdb::Env::IOActivity, std::_Identity<rocksdb::Env::IOActivity>, std::less<rocksdb::Env::IOActivity>, std::allocator<rocksdb::Env::IOActivity>> const&) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_tree.h:208 (db_stress+0x411c32) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
https://github.com/facebook/rocksdb/issues/1 rocksdb::DbStressListener::OnErrorRecoveryCompleted(rocksdb::Status) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_set.h:298 (db_stress+0x4112e5) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
https://github.com/facebook/rocksdb/issues/2 rocksdb::EventHelpers::NotifyOnErrorRecoveryEnd(std::vector<std::shared_ptr<rocksdb::EventListener>, std::allocator<std::shared_ptr<rocksdb::EventListener>>> const&, rocksdb::Status const&, rocksdb::Status const&, rocksdb::InstrumentedMutex*) fbcode/internal_repo_rocksdb/repo/db/event_helpers.cc:239 (db_stress+0xa09d60) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
Previous read of size 8 at 0x7b4c000004d0 by thread T131 (mutexes: write M1):
#0 rocksdb::FaultInjectionTestFS::MaybeInjectThreadLocalError(rocksdb::FaultInjectionIOType, rocksdb::IOOptions const&, rocksdb::FaultInjectionTestFS::ErrorOperation, rocksdb::Slice*, bool, char*, bool, bool*) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_tree.h:798 (db_stress+0xf7d0f3) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12799
Test Plan: CI
Reviewed By: jowlyzhang
Differential Revision: D58917449
Pulled By: hx235
fbshipit-source-id: f24fc1acc2a7d91f9f285447a97ba41397f48dbd
2024-06-24 23:10:36 +00:00
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kRead);
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
}
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
if (!res) {
|
2019-12-09 07:49:32 +00:00
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
|
2024-07-29 20:51:49 +00:00
|
|
|
// To track the final write status
|
|
|
|
Status s;
|
|
|
|
// To track the initial write status
|
|
|
|
Status initial_write_s;
|
|
|
|
// To track whether WAL write may have succeeded during the initial failed
|
|
|
|
// write
|
|
|
|
bool initial_wal_write_may_succeed = true;
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
PendingExpectedValue pending_expected_value =
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
shared->PreparePut(rand_column_family, rand_key);
|
2024-07-29 20:51:49 +00:00
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
const uint32_t value_base = pending_expected_value.GetFinalValueBase();
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
const size_t sz = GenerateValue(value_base, value, sizeof(value));
|
|
|
|
const Slice v(value, sz);
|
|
|
|
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
uint64_t wait_for_recover_start_time = 0;
|
2024-07-29 20:51:49 +00:00
|
|
|
do {
|
|
|
|
// In order to commit the expected state for the initial write failed with
|
|
|
|
// injected retryable error and successful WAL write, retry the write
|
|
|
|
// until it succeeds after the recovery finishes
|
|
|
|
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
|
|
|
initial_wal_write_may_succeed) {
|
|
|
|
std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
|
|
|
|
}
|
|
|
|
if (FLAGS_use_put_entity_one_in > 0 &&
|
|
|
|
(value_base % FLAGS_use_put_entity_one_in) == 0) {
|
|
|
|
if (!FLAGS_use_txn) {
|
|
|
|
if (FLAGS_use_attribute_group) {
|
|
|
|
s = db_->PutEntity(write_opts, k,
|
|
|
|
GenerateAttributeGroups({cfh}, value_base, v));
|
|
|
|
} else {
|
|
|
|
s = db_->PutEntity(write_opts, cfh, k,
|
|
|
|
GenerateWideColumns(value_base, v));
|
|
|
|
}
|
2024-05-22 18:30:33 +00:00
|
|
|
} else {
|
2024-07-29 20:51:49 +00:00
|
|
|
s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
|
|
|
|
return txn.PutEntity(cfh, k, GenerateWideColumns(value_base, v));
|
|
|
|
});
|
2024-05-22 18:30:33 +00:00
|
|
|
}
|
2024-07-29 20:51:49 +00:00
|
|
|
} else if (FLAGS_use_timed_put_one_in > 0 &&
|
|
|
|
((value_base + kLargePrimeForCommonFactorSkew) %
|
|
|
|
FLAGS_use_timed_put_one_in) == 0) {
|
|
|
|
WriteBatch wb;
|
|
|
|
uint64_t write_unix_time = GetWriteUnixTime(thread);
|
|
|
|
s = wb.TimedPut(cfh, k, v, write_unix_time);
|
|
|
|
if (s.ok()) {
|
|
|
|
s = db_->Write(write_opts, &wb);
|
|
|
|
}
|
|
|
|
} else if (FLAGS_use_merge) {
|
|
|
|
if (!FLAGS_use_txn) {
|
|
|
|
if (FLAGS_user_timestamp_size == 0) {
|
|
|
|
s = db_->Merge(write_opts, cfh, k, v);
|
|
|
|
} else {
|
|
|
|
s = db_->Merge(write_opts, cfh, k, write_ts, v);
|
|
|
|
}
|
2022-11-18 04:43:50 +00:00
|
|
|
} else {
|
2024-07-29 20:51:49 +00:00
|
|
|
s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
|
|
|
|
return txn.Merge(cfh, k, v);
|
|
|
|
});
|
2022-11-18 04:43:50 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
} else {
|
2024-07-29 20:51:49 +00:00
|
|
|
if (!FLAGS_use_txn) {
|
|
|
|
if (FLAGS_user_timestamp_size == 0) {
|
|
|
|
s = db_->Put(write_opts, cfh, k, v);
|
|
|
|
} else {
|
|
|
|
s = db_->Put(write_opts, cfh, k, write_ts, v);
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
} else {
|
2024-07-29 20:51:49 +00:00
|
|
|
s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
|
|
|
|
return txn.Put(cfh, k, v);
|
|
|
|
});
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
|
|
|
|
&initial_wal_write_may_succeed,
|
|
|
|
&wait_for_recover_start_time);
|
|
|
|
|
2024-07-29 20:51:49 +00:00
|
|
|
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
|
|
|
initial_wal_write_may_succeed);
|
Add the PutEntity API to the stress/crash tests (#10760)
Summary:
The patch adds the `PutEntity` API to the non-batched, batched, and
CF consistency stress tests. Namely, when the new `db_stress` command
line parameter `use_put_entity_one_in` is greater than zero, one in
N writes on average is performed using `PutEntity` rather than `Put`.
The wide-column entity written has the generated value in its default
column; in addition, it contains up to three additional columns where
the original generated value is divided up between the column name and the
column value (with the column name containing the first k characters of
the generated value, and the column value containing the rest). Whether
`PutEntity` is used (and if so, how many columns the entity has) is completely
determined by the "value base" used to generate the value (that is, there is
no randomness involved). Assuming the same `use_put_entity_one_in` setting
is used across `db_stress` invocations, this enables us to reconstruct and
validate the entity during subsequent `db_stress` runs.
Note that `PutEntity` is currently incompatible with `Merge`, transactions, and
user-defined timestamps; these combinations are currently disabled/disallowed.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10760
Test Plan: Ran some batched, non-batched, and CF consistency stress tests using the script.
Reviewed By: riversand963
Differential Revision: D39939032
Pulled By: ltamasi
fbshipit-source-id: eafdf124e95993fb7d73158e3b006d11819f7fa9
2022-09-30 18:11:07 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
if (!s.ok()) {
|
2024-01-16 21:28:51 +00:00
|
|
|
pending_expected_value.Rollback();
|
2024-06-25 03:51:39 +00:00
|
|
|
if (IsErrorInjectedAndRetryable(s)) {
|
2024-07-29 20:51:49 +00:00
|
|
|
assert(!initial_wal_write_may_succeed);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return s;
|
|
|
|
} else if (FLAGS_inject_error_severity == 2) {
|
2021-07-01 21:15:49 +00:00
|
|
|
if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
|
|
|
|
is_db_stopped_ = true;
|
|
|
|
} else if (!is_db_stopped_ ||
|
|
|
|
s.severity() < Status::Severity::kFatalError) {
|
|
|
|
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2021-07-01 21:15:49 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2021-07-01 21:15:49 +00:00
|
|
|
}
|
2024-07-29 20:51:49 +00:00
|
|
|
} else {
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PrintWriteRecoveryWaitTimeIfNeeded(
|
|
|
|
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
|
|
|
|
wait_for_recover_start_time, "TestPut");
|
2024-07-29 20:51:49 +00:00
|
|
|
pending_expected_value.Commit();
|
|
|
|
thread->stats.AddBytesForWrites(1, sz);
|
|
|
|
PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
|
|
|
|
sz);
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
Status TestDelete(ThreadState* thread, WriteOptions& write_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
2022-09-15 22:55:37 +00:00
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
2019-12-09 07:49:32 +00:00
|
|
|
int64_t rand_key = rand_keys[0];
|
|
|
|
int rand_column_family = rand_column_families[0];
|
|
|
|
auto shared = thread->shared;
|
|
|
|
|
2022-09-15 22:55:37 +00:00
|
|
|
std::unique_ptr<MutexLock> lock(
|
|
|
|
new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
// OPERATION delete
|
2022-07-05 20:30:15 +00:00
|
|
|
std::string write_ts_str = GetNowNanos();
|
2022-05-02 23:19:00 +00:00
|
|
|
Slice write_ts = write_ts_str;
|
2019-12-09 07:49:32 +00:00
|
|
|
|
|
|
|
std::string key_str = Key(rand_key);
|
|
|
|
Slice key = key_str;
|
|
|
|
auto cfh = column_families_[rand_column_family];
|
|
|
|
|
2024-07-29 20:51:49 +00:00
|
|
|
// To track the final write status
|
|
|
|
Status s;
|
|
|
|
// To track the initial write status
|
|
|
|
Status initial_write_s;
|
|
|
|
// To track whether WAL write may have succeeded during the initial failed
|
|
|
|
// write
|
|
|
|
bool initial_wal_write_may_succeed = true;
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
// Use delete if the key may be overwritten and a single deletion
|
|
|
|
// otherwise.
|
|
|
|
if (shared->AllowsOverwrite(rand_key)) {
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
PendingExpectedValue pending_expected_value =
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
shared->PrepareDelete(rand_column_family, rand_key);
|
2024-07-29 20:51:49 +00:00
|
|
|
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
uint64_t wait_for_recover_start_time = 0;
|
2024-07-29 20:51:49 +00:00
|
|
|
do {
|
|
|
|
// In order to commit the expected state for the initial write failed
|
|
|
|
// with injected retryable error and successful WAL write, retry the
|
|
|
|
// write until it succeeds after the recovery finishes
|
|
|
|
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
|
|
|
initial_wal_write_may_succeed) {
|
|
|
|
std::this_thread::sleep_for(
|
|
|
|
std::chrono::microseconds(1 * 1000 * 1000));
|
|
|
|
}
|
|
|
|
if (!FLAGS_use_txn) {
|
|
|
|
if (FLAGS_user_timestamp_size == 0) {
|
|
|
|
s = db_->Delete(write_opts, cfh, key);
|
|
|
|
} else {
|
|
|
|
s = db_->Delete(write_opts, cfh, key, write_ts);
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
} else {
|
2024-07-29 20:51:49 +00:00
|
|
|
s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
|
|
|
|
return txn.Delete(cfh, key);
|
|
|
|
});
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
|
|
|
|
&initial_wal_write_may_succeed,
|
|
|
|
&wait_for_recover_start_time);
|
2024-07-29 20:51:49 +00:00
|
|
|
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
|
|
|
initial_wal_write_may_succeed);
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
if (!s.ok()) {
|
2024-01-16 21:28:51 +00:00
|
|
|
pending_expected_value.Rollback();
|
2024-06-25 03:51:39 +00:00
|
|
|
if (IsErrorInjectedAndRetryable(s)) {
|
2024-07-29 20:51:49 +00:00
|
|
|
assert(!initial_wal_write_may_succeed);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return s;
|
|
|
|
} else if (FLAGS_inject_error_severity == 2) {
|
2021-07-01 21:15:49 +00:00
|
|
|
if (!is_db_stopped_ &&
|
|
|
|
s.severity() >= Status::Severity::kFatalError) {
|
|
|
|
is_db_stopped_ = true;
|
|
|
|
} else if (!is_db_stopped_ ||
|
|
|
|
s.severity() < Status::Severity::kFatalError) {
|
|
|
|
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2021-07-01 21:15:49 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2021-07-01 21:15:49 +00:00
|
|
|
}
|
2024-07-29 20:51:49 +00:00
|
|
|
} else {
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PrintWriteRecoveryWaitTimeIfNeeded(
|
|
|
|
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
|
|
|
|
wait_for_recover_start_time, "TestDelete");
|
2024-07-29 20:51:49 +00:00
|
|
|
pending_expected_value.Commit();
|
|
|
|
thread->stats.AddDeletes(1);
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
} else {
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
PendingExpectedValue pending_expected_value =
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
shared->PrepareSingleDelete(rand_column_family, rand_key);
|
2024-07-29 20:51:49 +00:00
|
|
|
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
uint64_t wait_for_recover_start_time = 0;
|
2024-07-29 20:51:49 +00:00
|
|
|
do {
|
|
|
|
// In order to commit the expected state for the initial write failed
|
|
|
|
// with injected retryable error and successful WAL write, retry the
|
|
|
|
// write until it succeeds after the recovery finishes
|
|
|
|
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
|
|
|
initial_wal_write_may_succeed) {
|
|
|
|
std::this_thread::sleep_for(
|
|
|
|
std::chrono::microseconds(1 * 1000 * 1000));
|
|
|
|
}
|
|
|
|
if (!FLAGS_use_txn) {
|
|
|
|
if (FLAGS_user_timestamp_size == 0) {
|
|
|
|
s = db_->SingleDelete(write_opts, cfh, key);
|
|
|
|
} else {
|
|
|
|
s = db_->SingleDelete(write_opts, cfh, key, write_ts);
|
|
|
|
}
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
} else {
|
2024-07-29 20:51:49 +00:00
|
|
|
s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
|
|
|
|
return txn.SingleDelete(cfh, key);
|
|
|
|
});
|
Revise APIs related to user-defined timestamp (#8946)
Summary:
ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`.
Namely, `WriteOptions` should not include information about "what-to-write", but should just
include information about "how-to-write".
According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore,
this PR removes `WriteOptions::timestamp` for compliance.
After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set
of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and
`SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity
made me reconsider doing it in another PR (maybe).
For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take
extra `timestamp` information when writing to `WriteBatch`es.
These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list.
Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to
`WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps
allocated already and multiple timestamps can be updated.
The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp
size of the default column family. This will be used to allocate space when calling APIs that do not
specify a column family handle.
Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing
some assertions about timestamp to returning Status code.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946
Test Plan:
make check
./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8
./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0
Make sure there is no perf regression by running the following
```
./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom
```
Before this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s
```
After this PR
```
DB path: [/dev/shm/rocksdb]
fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s
```
Reviewed By: ltamasi
Differential Revision: D33721359
Pulled By: riversand963
fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
2022-02-02 06:17:46 +00:00
|
|
|
}
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
|
|
|
|
&initial_wal_write_may_succeed,
|
|
|
|
&wait_for_recover_start_time);
|
2024-07-29 20:51:49 +00:00
|
|
|
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
|
|
|
initial_wal_write_may_succeed);
|
2023-09-19 06:04:38 +00:00
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
if (!s.ok()) {
|
2024-01-16 21:28:51 +00:00
|
|
|
pending_expected_value.Rollback();
|
2024-06-25 03:51:39 +00:00
|
|
|
if (IsErrorInjectedAndRetryable(s)) {
|
2024-07-29 20:51:49 +00:00
|
|
|
assert(!initial_wal_write_may_succeed);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return s;
|
|
|
|
} else if (FLAGS_inject_error_severity == 2) {
|
2021-07-01 21:15:49 +00:00
|
|
|
if (!is_db_stopped_ &&
|
|
|
|
s.severity() >= Status::Severity::kFatalError) {
|
|
|
|
is_db_stopped_ = true;
|
|
|
|
} else if (!is_db_stopped_ ||
|
|
|
|
s.severity() < Status::Severity::kFatalError) {
|
|
|
|
fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2021-07-01 21:15:49 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2021-07-01 21:15:49 +00:00
|
|
|
}
|
2024-07-29 20:51:49 +00:00
|
|
|
} else {
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PrintWriteRecoveryWaitTimeIfNeeded(
|
|
|
|
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
|
|
|
|
wait_for_recover_start_time, "TestDelete");
|
2024-07-29 20:51:49 +00:00
|
|
|
pending_expected_value.Commit();
|
|
|
|
thread->stats.AddSingleDeletes(1);
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
2022-09-15 22:55:37 +00:00
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
2019-12-09 07:49:32 +00:00
|
|
|
// OPERATION delete range
|
|
|
|
std::vector<std::unique_ptr<MutexLock>> range_locks;
|
|
|
|
// delete range does not respect disallowed overwrites. the keys for
|
|
|
|
// which overwrites are disallowed are randomly distributed so it
|
|
|
|
// could be expensive to find a range where each key allows
|
|
|
|
// overwrites.
|
|
|
|
int64_t rand_key = rand_keys[0];
|
|
|
|
int rand_column_family = rand_column_families[0];
|
|
|
|
auto shared = thread->shared;
|
|
|
|
int64_t max_key = shared->GetMaxKey();
|
|
|
|
if (rand_key > max_key - FLAGS_range_deletion_width) {
|
|
|
|
rand_key =
|
|
|
|
thread->rand.Next() % (max_key - FLAGS_range_deletion_width + 1);
|
|
|
|
}
|
2024-07-29 20:51:49 +00:00
|
|
|
GetDeleteRangeKeyLocks(thread, rand_column_family, rand_key, &range_locks);
|
|
|
|
|
|
|
|
// To track the final write status
|
|
|
|
Status s;
|
|
|
|
// To track the initial write status
|
|
|
|
Status initial_write_s;
|
|
|
|
// To track whether WAL write may have succeeded during the initial failed
|
|
|
|
// write
|
|
|
|
bool initial_wal_write_may_succeed = true;
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
std::vector<PendingExpectedValue> pending_expected_values =
|
|
|
|
shared->PrepareDeleteRange(rand_column_family, rand_key,
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
rand_key + FLAGS_range_deletion_width);
|
2024-07-29 20:51:49 +00:00
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
const int covered = static_cast<int>(pending_expected_values.size());
|
2019-12-09 07:49:32 +00:00
|
|
|
std::string keystr = Key(rand_key);
|
|
|
|
Slice key = keystr;
|
|
|
|
auto cfh = column_families_[rand_column_family];
|
|
|
|
std::string end_keystr = Key(rand_key + FLAGS_range_deletion_width);
|
|
|
|
Slice end_key = end_keystr;
|
2022-09-30 23:13:03 +00:00
|
|
|
std::string write_ts_str;
|
|
|
|
Slice write_ts;
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
uint64_t wait_for_recover_start_time = 0;
|
2024-07-29 20:51:49 +00:00
|
|
|
|
|
|
|
do {
|
|
|
|
// In order to commit the expected state for the initial write failed with
|
|
|
|
// injected retryable error and successful WAL write, retry the write
|
|
|
|
// until it succeeds after the recovery finishes
|
|
|
|
if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
|
|
|
initial_wal_write_may_succeed) {
|
|
|
|
std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
|
|
|
|
}
|
|
|
|
if (FLAGS_user_timestamp_size) {
|
|
|
|
write_ts_str = GetNowNanos();
|
|
|
|
write_ts = write_ts_str;
|
|
|
|
s = db_->DeleteRange(write_opts, cfh, key, end_key, write_ts);
|
|
|
|
} else {
|
|
|
|
s = db_->DeleteRange(write_opts, cfh, key, end_key);
|
|
|
|
}
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
|
|
|
|
&initial_wal_write_may_succeed,
|
|
|
|
&wait_for_recover_start_time);
|
2024-07-29 20:51:49 +00:00
|
|
|
} while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
|
|
|
|
initial_wal_write_may_succeed);
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
if (!s.ok()) {
|
2024-01-16 21:28:51 +00:00
|
|
|
for (PendingExpectedValue& pending_expected_value :
|
|
|
|
pending_expected_values) {
|
|
|
|
pending_expected_value.Rollback();
|
|
|
|
}
|
2024-06-25 03:51:39 +00:00
|
|
|
if (IsErrorInjectedAndRetryable(s)) {
|
2024-07-29 20:51:49 +00:00
|
|
|
assert(!initial_wal_write_may_succeed);
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return s;
|
|
|
|
} else if (FLAGS_inject_error_severity == 2) {
|
2021-07-01 21:15:49 +00:00
|
|
|
if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
|
|
|
|
is_db_stopped_ = true;
|
|
|
|
} else if (!is_db_stopped_ ||
|
|
|
|
s.severity() < Status::Severity::kFatalError) {
|
|
|
|
fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2021-07-01 21:15:49 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
|
Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653)
Summary:
In rare cases, optimistic transaction commit returns TryAgain. This change tolerates that intentional behavior in db_stress, up to a small limit in a row. This way, we don't miss a possible regression with excessive TryAgain, and trying again (rolling back the transaction) should have a well renewed chance of success as the writes will be associated with fresh sequence numbers.
Also, some of the APIs were not clear about Transaction semantics, so I have clarified:
* (Best I can tell....) Destroying a Transaction is safe without calling Rollback() (or at least should be). I don't know why it's a common pattern in our test code and examples to rollback before unconditional destruction. Stress test updated not to call Rollback unnecessarily (to test safe destruction).
* Despite essentially doing what is asked, simply trying Commit() again when it returns TryAgain does not have a chance of success, because of the transaction being bound to the DB state at the time of operations before Commit. Similar logic applies to Busy AFAIK. Commit() API comments updated, and expanded unit test in optimistic_transaction_test.
Also also, because I can't stop myself, I refactored a good portion of the transaction handling code in db_stress.
* Avoid existing and new copy-paste for most transaction interactions with a new ExecuteTransaction (higher-order) function.
* Use unique_ptr (nicely complements removing unnecessary Rollbacks)
* Abstract out a pattern for safely calling std::terminate() and use it in more places. (The TryAgain errors we saw did not have stack traces because of "terminate called recursively".)
Intended follow-up: resurrect use of `FLAGS_rollback_one_in` but also include non-trivial cases
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11653
Test Plan:
this is the test :)
Also, temporarily bypassed the new retry logic and boosted the chance of hitting TryAgain. Quickly reproduced the TryAgain error. Then re-enabled the new retry logic, and was not able to hit the error after running for tens of minutes, even with the boosted chances.
Reviewed By: cbi42
Differential Revision: D47882995
Pulled By: pdillinger
fbshipit-source-id: 21eadb1525423340dbf28d17cf166b9583311a0d
2023-07-28 23:25:29 +00:00
|
|
|
thread->shared->SafeTerminate();
|
2021-07-01 21:15:49 +00:00
|
|
|
}
|
2024-07-29 20:51:49 +00:00
|
|
|
} else {
|
Fix false-positive TestBackupRestore corruption (#12917)
Summary:
**Context:**
https://github.com/facebook/rocksdb/pull/12838 allows a write thread encountered certain injected error to release the lock and sleep before retrying write in order to reduce performance cost. This requires adding checks like [this](https://github.com/facebook/rocksdb/blob/b26b395e0a15255d322be08110db551976188745/db_stress_tool/expected_value.cc#L29-L31) to prevent writing to the same key from another thread.
The added check causes a false-positive failure when delete range + file ingestion + backup is used. Consider the following scenario:
(1) Issue a delete range covering some key that do not exist and a key does exist (named as k1). k1 will have "pending delete" state while the keys that does not exit will have whatever state they already have since we don't delete a key that does not exist already.
(2) After https://github.com/facebook/rocksdb/pull/12838, `PrepareDeleteRange(... &prepared)` will return `prepared = false`. So below logic will be executed and k1's "pending delete" won't get roll-backed nor committed.
```
std::vector<PendingExpectedValue> pending_expected_values =
shared->PrepareDeleteRange(rand_column_family, rand_key,
rand_key + FLAGS_range_deletion_width,
&prepared);
if (!prepared) {
for (PendingExpectedValue& pending_expected_value :
pending_expected_values) {
pending_expected_value.PermitUnclosedPendingState();
}
return s;
}
```
(3) Issue an file ingestion covering k1 and another key k2. Similar to (2), we will have `shared->PreparePut(column_family, key, &prepared)` return `prepared = false` for k1 while k2 will have a "pending put" state. So below logic will be executed and k2's "pending put" state won't get roll-backed nor committed.
```
for (int64_t key = key_base;
s.ok() && key < shared->GetMaxKey() &&
static_cast<int32_t>(keys.size()) < FLAGS_ingest_external_file_width;
++key)
PendingExpectedValue pending_expected_value =
shared->PreparePut(column_family, key, &prepared);
if (!prepared) {
pending_expected_value.PermitUnclosedPendingState();
for (PendingExpectedValue& pev : pending_expected_values) {
pev.PermitUnclosedPendingState();
}
return;
}
}
```
(4) Issue a backup and verify on k2. Below logic decides that k2 should exist in restored DB since it has a pending write state while k2 is never ingested into the original DB as (3) returns early.
```
bool Exists() const { return PendingPut() || !IsDeleted(); }
TestBackupRestore() {
...
Status get_status = restored_db->Get(
read_opts, restored_cf_handles[rand_column_families[i]], key,
&restored_value);
bool exists = thread->shared->Exists(rand_column_families[i], rand_keys[0]);
if (get_status.ok()) {
if (!exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in restore but not in original db";
s = Status::Corruption(oss.str());
}
} else if (get_status.IsNotFound()) {
if (exists && from_latest && ShouldAcquireMutexOnKey()) {
std::ostringstream oss;
oss << "0x" << key.ToString(true)
<< " exists in original db but not in restore";
s = Status::Corruption(oss.str());
}
}
...
}
```
So we see false-positive corruption like `Failure in a backup/restore operation with: Corruption: 0x000000000000017B0000000000000073787878 exists in original db but not in restore`
A simple fix is to remove `PendingPut()` from `bool Exists() ` since it's called under a lock and should never see a pending write. However, in order for "under a lock and should never see a pending write" to be true, we need to remove the logic of releasing the lock during sleep in the write thread, which expose pending write to other thread that can call Exists() like back up thread.
The downside of holding lock during sleep is blocking other write thread of the same key to proceed cuz they need to wait for the lock. This should happen rarely as the key of a thread is selected randomly in crash test like below.
```
void StressTest::OperateDb(ThreadState* thread) {
for (uint64_t i = 0; i < ops_per_open; i++) {
...
int64_t rand_key = GenerateOneKey(thread, i);
...
}
}
```
**Summary:**
- Removed the "lock release" part and related checks
- Printed recovery time if the write thread waited more than 10 seconds
- Reverted regression in testing coverage when deleting a non-existent key
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12917
Test Plan:
Below command repro-ed frequently before the fix and not after.
```
./db_stress --WAL_size_limit_MB=1 --WAL_ttl_seconds=60 --acquire_snapshot_one_in=0 --adaptive_readahead=0 --adm_policy=1 --advise_random_on_open=1 --allow_concurrent_memtable_write=0 --allow_data_in_errors=True --allow_fallocate=0 --allow_setting_blob_options_dynamically=1 --async_io=0 --auto_readahead_size=1 --avoid_flush_during_recovery=0 --avoid_flush_during_shutdown=0 --avoid_unnecessary_blocking_io=0 --backup_max_size=104857600 --backup_one_in=100000 --batch_protection_bytes_per_key=0 --bgerror_resume_retry_interval=100 --blob_cache_size=8388608 --blob_compaction_readahead_size=1048576 --blob_compression_type=none --blob_file_size=1073741824 --blob_file_starting_level=1 --blob_garbage_collection_age_cutoff=0.0 --blob_garbage_collection_force_threshold=0.75 --block_align=0 --block_protection_bytes_per_key=8 --block_size=16384 --bloom_before_level=2147483647 --bloom_bits=16.216959977115277 --bottommost_compression_type=xpress --bottommost_file_compaction_delay=600 --bytes_per_sync=262144 --cache_index_and_filter_blocks=1 --cache_index_and_filter_blocks_with_high_priority=1 --cache_size=8388608 --cache_type=lru_cache --charge_compression_dictionary_building_buffer=1 --charge_file_metadata=0 --charge_filter_construction=0 --charge_table_reader=1 --check_multiget_consistency=0 --check_multiget_entity_consistency=0 --checkpoint_one_in=1000000 --checksum_type=kXXH3 --clear_column_family_one_in=0 --column_families=1 --compact_files_one_in=1000 --compact_range_one_in=0 --compaction_pri=3 --compaction_readahead_size=0 --compaction_ttl=10 --compress_format_version=2 --compressed_secondary_cache_size=8388608 --compression_checksum=0 --compression_max_dict_buffer_bytes=2097151 --compression_max_dict_bytes=16384 --compression_parallel_threads=1 --compression_type=zlib --compression_use_zstd_dict_trainer=0 --compression_zstd_max_train_bytes=0 --continuous_verification_interval=0 --daily_offpeak_time_utc=04:00-08:00 --data_block_index_type=0 --db=/dev/shm/rocksdb_test/rocksdb_crashtest_whitebox --db_write_buffer_size=0 --default_temperature=kUnknown --default_write_temperature=kWarm --delete_obsolete_files_period_micros=21600000000 --delpercent=0 --delrangepercent=5 --destroy_db_initially=0 --detect_filter_construct_corruption=1 --disable_file_deletions_one_in=10000 --disable_manual_compaction_one_in=1000000 --disable_wal=0 --dump_malloc_stats=0 --enable_blob_files=0 --enable_blob_garbage_collection=1 --enable_checksum_handoff=1 --enable_compaction_filter=1 --enable_custom_split_merge=1 --enable_do_not_compress_roles=0 --enable_index_compression=1 --enable_memtable_insert_with_hint_prefix_extractor=0 --enable_pipelined_write=1 --enable_sst_partitioner_factory=1 --enable_thread_tracking=0 --enable_write_thread_adaptive_yield=0 --error_recovery_with_no_fault_injection=1 --exclude_wal_from_write_fault_injection=1 --expected_values_dir=/dev/shm/rocksdb_test/rocksdb_crashtest_expected --fail_if_options_file_error=0 --fifo_allow_compaction=1 --file_checksum_impl=big --fill_cache=1 --flush_one_in=1000000 --format_version=2 --get_all_column_family_metadata_one_in=10000 --get_current_wal_file_one_in=0 --get_live_files_apis_one_in=1000000 --get_properties_of_all_tables_one_in=100000 --get_property_one_in=100000 --get_sorted_wal_files_one_in=0 --hard_pending_compaction_bytes_limit=2097152 --high_pri_pool_ratio=0.5 --index_block_restart_interval=1 --index_shortening=2 --index_type=0 --ingest_external_file_one_in=1000 --initial_auto_readahead_size=0 --inplace_update_support=0 --iterpercent=0 --key_len_percent_dist=1,30,69 --key_may_exist_one_in=100 --last_level_temperature=kUnknown --level_compaction_dynamic_level_bytes=0 --lock_wal_one_in=10000 --log2_keys_per_lock=10 --log_file_time_to_roll=0 --log_readahead_size=0 --long_running_snapshots=1 --low_pri_pool_ratio=0.5 --lowest_used_cache_tier=1 --manifest_preallocation_size=0 --manual_wal_flush_one_in=0 --mark_for_compaction_one_file_in=10 --max_auto_readahead_size=16384 --max_background_compactions=1 --max_bytes_for_level_base=67108864 --max_key=100000 --max_key_len=3 --max_log_file_size=1048576 --max_manifest_file_size=1073741824 --max_sequential_skip_in_iterations=16 --max_total_wal_size=0 --max_write_batch_group_size_bytes=16 --max_write_buffer_number=10 --max_write_buffer_size_to_maintain=8388608 --memtable_insert_hint_per_batch=1 --memtable_max_range_deletions=1000 --memtable_prefix_bloom_size_ratio=0.001 --memtable_protection_bytes_per_key=4 --memtable_whole_key_filtering=1 --memtablerep=skip_list --metadata_charge_policy=1 --metadata_read_fault_one_in=0 --metadata_write_fault_one_in=0 --min_blob_size=16 --min_write_buffer_number_to_merge=2 --mmap_read=0 --mock_direct_io=False --nooverwritepercent=1 --num_file_reads_for_auto_readahead=0 --open_files=-1 --open_metadata_read_fault_one_in=0 --open_metadata_write_fault_one_in=0 --open_read_fault_one_in=0 --open_write_fault_one_in=0 --ops_per_thread=20000000 --optimize_filters_for_hits=1 --optimize_filters_for_memory=0 --optimize_multiget_for_io=1 --paranoid_file_checks=1 --partition_filters=0 --partition_pinning=1 --pause_background_one_in=10000 --periodic_compaction_seconds=10 --prefix_size=8 --prefixpercent=0 --prepopulate_blob_cache=1 --prepopulate_block_cache=1 --preserve_internal_time_seconds=0 --progress_reports=0 --promote_l0_one_in=0 --read_amp_bytes_per_bit=0 --read_fault_one_in=0 --readahead_size=524288 --readpercent=60 --recycle_log_file_num=1 --reopen=20 --report_bg_io_stats=0 --reset_stats_one_in=1000000 --sample_for_compression=5 --secondary_cache_fault_one_in=0 --secondary_cache_uri= --skip_stats_update_on_db_open=1 --snapshot_hold_ops=100000 --soft_pending_compaction_bytes_limit=68719476736 --sqfc_name=foo --sqfc_version=1 --sst_file_manager_bytes_per_sec=0 --sst_file_manager_bytes_per_truncate=0 --stats_dump_period_sec=10 --stats_history_buffer_size=1048576 --strict_bytes_per_sync=1 --subcompactions=2 --sync=0 --sync_fault_injection=0 --table_cache_numshardbits=0 --target_file_size_base=16777216 --target_file_size_multiplier=1 --test_batches_snapshots=0 --top_level_index_pinning=3 --uncache_aggressiveness=118 --universal_max_read_amp=-1 --unpartitioned_pinning=0 --use_adaptive_mutex=0 --use_adaptive_mutex_lru=1 --use_attribute_group=0 --use_blob_cache=0 --use_delta_encoding=1 --use_direct_io_for_flush_and_compaction=0 --use_direct_reads=0 --use_full_merge_v1=0 --use_get_entity=0 --use_merge=0 --use_multi_cf_iterator=0 --use_multi_get_entity=0 --use_multiget=1 --use_put_entity_one_in=0 --use_shared_block_and_blob_cache=1 --use_sqfc_for_range_queries=1 --use_timed_put_one_in=0 --use_write_buffer_manager=0 --user_timestamp_size=0 --value_size_mult=32 --verification_only=0 --verify_checksum=1 --verify_checksum_one_in=1000000 --verify_compression=0 --verify_db_one_in=10000 --verify_file_checksums_one_in=1000000 --verify_iterator_with_expected_state_one_in=5 --verify_sst_unique_id_in_manifest=1 --wal_bytes_per_sync=0 --wal_compression=none --write_buffer_size=33554432 --write_dbid_to_manifest=0 --write_fault_one_in=0 --writepercent=35
```
Reviewed By: cbi42
Differential Revision: D60890580
Pulled By: hx235
fbshipit-source-id: 401f90d6d351c7ee11088cad06fb00e54062d416
2024-08-09 21:51:36 +00:00
|
|
|
PrintWriteRecoveryWaitTimeIfNeeded(
|
|
|
|
db_stress_env, initial_write_s, initial_wal_write_may_succeed,
|
|
|
|
wait_for_recover_start_time, "TestDeleteRange");
|
2024-07-29 20:51:49 +00:00
|
|
|
for (PendingExpectedValue& pending_expected_value :
|
|
|
|
pending_expected_values) {
|
|
|
|
pending_expected_value.Commit();
|
|
|
|
}
|
|
|
|
thread->stats.AddRangeDeletions(1);
|
|
|
|
thread->stats.AddCoveredByRangeDeletions(covered);
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
2019-12-09 22:36:10 +00:00
|
|
|
void TestIngestExternalFile(ThreadState* thread,
|
|
|
|
const std::vector<int>& rand_column_families,
|
2022-09-15 22:55:37 +00:00
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
// When true, we create two sst files, the first one with regular puts for
|
|
|
|
// a continuous range of keys, the second one with a standalone range
|
|
|
|
// deletion for all the keys. This is to exercise the standalone range
|
|
|
|
// deletion file's compaction input optimization.
|
2024-11-02 00:07:34 +00:00
|
|
|
bool test_standalone_range_deletion = thread->rand.OneInOpt(
|
|
|
|
FLAGS_test_ingest_standalone_range_deletion_one_in);
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
std::vector<std::string> external_files;
|
2019-12-09 07:49:32 +00:00
|
|
|
const std::string sst_filename =
|
2022-05-06 20:03:58 +00:00
|
|
|
FLAGS_db + "/." + std::to_string(thread->tid) + ".sst";
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
external_files.push_back(sst_filename);
|
|
|
|
std::string standalone_rangedel_filename;
|
|
|
|
if (test_standalone_range_deletion) {
|
|
|
|
standalone_rangedel_filename = FLAGS_db + "/." +
|
|
|
|
std::to_string(thread->tid) +
|
|
|
|
"_standalone_rangedel.sst";
|
|
|
|
external_files.push_back(standalone_rangedel_filename);
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
Status s;
|
Add missing db crash options (#12414)
Summary:
**Context/Summary:**
We are doing a sweep in all public options, including but not limited to the `Options`, `Read/WriteOptions`, `IngestExternalFileOptions`, cache options.., to find and add the uncovered ones into db crash. The options included in this PR require minimum changes to db crash other than adding the options themselves.
A bonus change: to surface new issues by improved coverage in stderror, we decided to fail/terminate crash test for manual compactions (CompactFiles, CompactRange()) on meaningful errors. See https://github.com/facebook/rocksdb/pull/12414/files#diff-5c4ced6afb6a90e27fec18ab03b2cd89e8f99db87791b4ecc6fa2694284d50c0R2528-R2532, https://github.com/facebook/rocksdb/pull/12414/files#diff-5c4ced6afb6a90e27fec18ab03b2cd89e8f99db87791b4ecc6fa2694284d50c0R2330-R2336 for more.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12414
Test Plan:
- Run `python3 ./tools/db_crashtest.py --simple blackbox` for 10 minutes to ensure no trivial failure
- Run `python3 tools/db_crashtest.py --simple blackbox --compact_files_one_in=1 --compact_range_one_in=1 --read_fault_one_in=1 --write_fault_one_in=1 --interval=50` for a while to ensure the bonus change does not result in trivial crash/termination of stress test
Reviewed By: ajkr, jowlyzhang, cbi42
Differential Revision: D54691774
Pulled By: hx235
fbshipit-source-id: 50443dfb6aaabd8e24c79a2e42b68c6de877be88
2024-03-13 00:24:12 +00:00
|
|
|
std::ostringstream ingest_options_oss;
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
|
|
|
|
// Temporarily disable error injection for preparation
|
Fix nullptr access and race to fault_fs_guard (#12799)
Summary:
**Context/Summary:**
There are a couple places where we forgot to check fault_fs_guard before accessing it. So we can see something like this occasionally
```
=138831==Hint: address points to the zero page.
SCARINESS: 10 (null-deref)
AddressSanitizer:DEADLYSIGNAL
#0 0x18b9e0b in rocksdb::ThreadLocalPtr::Get() const fbcode/internal_repo_rocksdb/repo/util/thread_local.cc:503
https://github.com/facebook/rocksdb/issues/1 0x83d8b7 in rocksdb::StressTest::TestCompactRange(rocksdb::ThreadState*, long, rocksdb::Slice const&, rocksdb::ColumnFamilyHandle*) fbcode/internal_repo_rocksdb/repo/utilities/fault_injection_fs.h
```
Also accessing of `io_activties_exempted_from_fault_injection.find` not fully synced so we see the following
```
WARNING: ThreadSanitizer: data race (pid=90939)
Write of size 8 at 0x7b4c000004d0 by thread T762 (mutexes: write M0):
#0 std::_Rb_tree<rocksdb::Env::IOActivity, rocksdb::Env::IOActivity, std::_Identity<rocksdb::Env::IOActivity>, std::less<rocksdb::Env::IOActivity>, std::allocator<rocksdb::Env::IOActivity>>::operator=(std::_Rb_tree<rocksdb::Env::IOActivity, rocksdb::Env::IOActivity, std::_Identity<rocksdb::Env::IOActivity>, std::less<rocksdb::Env::IOActivity>, std::allocator<rocksdb::Env::IOActivity>> const&) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_tree.h:208 (db_stress+0x411c32) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
https://github.com/facebook/rocksdb/issues/1 rocksdb::DbStressListener::OnErrorRecoveryCompleted(rocksdb::Status) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_set.h:298 (db_stress+0x4112e5) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
https://github.com/facebook/rocksdb/issues/2 rocksdb::EventHelpers::NotifyOnErrorRecoveryEnd(std::vector<std::shared_ptr<rocksdb::EventListener>, std::allocator<std::shared_ptr<rocksdb::EventListener>>> const&, rocksdb::Status const&, rocksdb::Status const&, rocksdb::InstrumentedMutex*) fbcode/internal_repo_rocksdb/repo/db/event_helpers.cc:239 (db_stress+0xa09d60) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
Previous read of size 8 at 0x7b4c000004d0 by thread T131 (mutexes: write M1):
#0 rocksdb::FaultInjectionTestFS::MaybeInjectThreadLocalError(rocksdb::FaultInjectionIOType, rocksdb::IOOptions const&, rocksdb::FaultInjectionTestFS::ErrorOperation, rocksdb::Slice*, bool, char*, bool, bool*) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_tree.h:798 (db_stress+0xf7d0f3) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12799
Test Plan: CI
Reviewed By: jowlyzhang
Differential Revision: D58917449
Pulled By: hx235
fbshipit-source-id: f24fc1acc2a7d91f9f285447a97ba41397f48dbd
2024-06-24 23:10:36 +00:00
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
fault_fs_guard->DisableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataWrite);
|
|
|
|
}
|
|
|
|
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
for (const auto& filename : external_files) {
|
|
|
|
if (db_stress_env->FileExists(filename).ok()) {
|
|
|
|
// Maybe we terminated abnormally before, so cleanup to give this file
|
|
|
|
// ingestion a clean slate
|
|
|
|
s = db_stress_env->DeleteFile(filename);
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return;
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
2024-07-29 20:51:49 +00:00
|
|
|
|
Fix nullptr access and race to fault_fs_guard (#12799)
Summary:
**Context/Summary:**
There are a couple places where we forgot to check fault_fs_guard before accessing it. So we can see something like this occasionally
```
=138831==Hint: address points to the zero page.
SCARINESS: 10 (null-deref)
AddressSanitizer:DEADLYSIGNAL
#0 0x18b9e0b in rocksdb::ThreadLocalPtr::Get() const fbcode/internal_repo_rocksdb/repo/util/thread_local.cc:503
https://github.com/facebook/rocksdb/issues/1 0x83d8b7 in rocksdb::StressTest::TestCompactRange(rocksdb::ThreadState*, long, rocksdb::Slice const&, rocksdb::ColumnFamilyHandle*) fbcode/internal_repo_rocksdb/repo/utilities/fault_injection_fs.h
```
Also accessing of `io_activties_exempted_from_fault_injection.find` not fully synced so we see the following
```
WARNING: ThreadSanitizer: data race (pid=90939)
Write of size 8 at 0x7b4c000004d0 by thread T762 (mutexes: write M0):
#0 std::_Rb_tree<rocksdb::Env::IOActivity, rocksdb::Env::IOActivity, std::_Identity<rocksdb::Env::IOActivity>, std::less<rocksdb::Env::IOActivity>, std::allocator<rocksdb::Env::IOActivity>>::operator=(std::_Rb_tree<rocksdb::Env::IOActivity, rocksdb::Env::IOActivity, std::_Identity<rocksdb::Env::IOActivity>, std::less<rocksdb::Env::IOActivity>, std::allocator<rocksdb::Env::IOActivity>> const&) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_tree.h:208 (db_stress+0x411c32) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
https://github.com/facebook/rocksdb/issues/1 rocksdb::DbStressListener::OnErrorRecoveryCompleted(rocksdb::Status) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_set.h:298 (db_stress+0x4112e5) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
https://github.com/facebook/rocksdb/issues/2 rocksdb::EventHelpers::NotifyOnErrorRecoveryEnd(std::vector<std::shared_ptr<rocksdb::EventListener>, std::allocator<std::shared_ptr<rocksdb::EventListener>>> const&, rocksdb::Status const&, rocksdb::Status const&, rocksdb::InstrumentedMutex*) fbcode/internal_repo_rocksdb/repo/db/event_helpers.cc:239 (db_stress+0xa09d60) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
Previous read of size 8 at 0x7b4c000004d0 by thread T131 (mutexes: write M1):
#0 rocksdb::FaultInjectionTestFS::MaybeInjectThreadLocalError(rocksdb::FaultInjectionIOType, rocksdb::IOOptions const&, rocksdb::FaultInjectionTestFS::ErrorOperation, rocksdb::Slice*, bool, char*, bool, bool*) fbcode/third-party-buck/platform010/build/libgcc/include/c++/trunk/bits/stl_tree.h:798 (db_stress+0xf7d0f3) (BuildId: b803e5aca22c6b080defed8e85b7bfec)
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12799
Test Plan: CI
Reviewed By: jowlyzhang
Differential Revision: D58917449
Pulled By: hx235
fbshipit-source-id: f24fc1acc2a7d91f9f285447a97ba41397f48dbd
2024-06-24 23:10:36 +00:00
|
|
|
if (fault_fs_guard) {
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataRead);
|
|
|
|
fault_fs_guard->EnableThreadLocalErrorInjection(
|
|
|
|
FaultInjectionIOType::kMetadataWrite);
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
|
|
|
|
SstFileWriter sst_file_writer(EnvOptions(options_), options_);
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
SstFileWriter standalone_rangedel_sst_file_writer(EnvOptions(options_),
|
|
|
|
options_);
|
2019-12-09 07:49:32 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
s = sst_file_writer.Open(sst_filename);
|
|
|
|
}
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
if (s.ok() && test_standalone_range_deletion) {
|
|
|
|
s = standalone_rangedel_sst_file_writer.Open(
|
|
|
|
standalone_rangedel_filename);
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2019-12-09 07:49:32 +00:00
|
|
|
int64_t key_base = rand_keys[0];
|
|
|
|
int column_family = rand_column_families[0];
|
|
|
|
std::vector<std::unique_ptr<MutexLock>> range_locks;
|
2022-05-26 17:31:37 +00:00
|
|
|
range_locks.reserve(FLAGS_ingest_external_file_width);
|
|
|
|
std::vector<int64_t> keys;
|
|
|
|
keys.reserve(FLAGS_ingest_external_file_width);
|
2019-12-09 07:49:32 +00:00
|
|
|
std::vector<uint32_t> values;
|
2022-05-26 17:31:37 +00:00
|
|
|
values.reserve(FLAGS_ingest_external_file_width);
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
std::vector<PendingExpectedValue> pending_expected_values;
|
|
|
|
pending_expected_values.reserve(FLAGS_ingest_external_file_width);
|
2019-12-09 07:49:32 +00:00
|
|
|
SharedState* shared = thread->shared;
|
|
|
|
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
// Grab locks, add keys
|
2022-05-26 17:31:37 +00:00
|
|
|
assert(FLAGS_nooverwritepercent < 100);
|
2019-12-09 07:49:32 +00:00
|
|
|
for (int64_t key = key_base;
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
key < shared->GetMaxKey() &&
|
|
|
|
key < key_base + FLAGS_ingest_external_file_width;
|
2019-12-09 07:49:32 +00:00
|
|
|
++key) {
|
2022-09-15 22:55:37 +00:00
|
|
|
if (key == key_base ||
|
|
|
|
(key & ((1 << FLAGS_log2_keys_per_lock) - 1)) == 0) {
|
2019-12-09 07:49:32 +00:00
|
|
|
range_locks.emplace_back(
|
|
|
|
new MutexLock(shared->GetMutexForKey(column_family, key)));
|
|
|
|
}
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
if (test_standalone_range_deletion) {
|
|
|
|
// Testing standalone range deletion needs a continuous range of keys.
|
|
|
|
if (shared->AllowsOverwrite(key)) {
|
|
|
|
if (keys.empty() || (!keys.empty() && keys.back() == key - 1)) {
|
|
|
|
keys.push_back(key);
|
|
|
|
} else {
|
|
|
|
keys.clear();
|
|
|
|
keys.push_back(key);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (keys.size() > 0) {
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (!shared->AllowsOverwrite(key)) {
|
|
|
|
// We could alternatively include `key` that is deleted.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
keys.push_back(key);
|
2022-05-26 17:31:37 +00:00
|
|
|
}
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
}
|
2024-07-29 20:51:49 +00:00
|
|
|
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
if (s.ok() && keys.empty()) {
|
|
|
|
return;
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
// set pending state on expected values, create and ingest files.
|
|
|
|
size_t total_keys = keys.size();
|
|
|
|
for (size_t i = 0; s.ok() && i < total_keys; i++) {
|
|
|
|
int64_t key = keys.at(i);
|
2019-12-09 07:49:32 +00:00
|
|
|
char value[100];
|
|
|
|
auto key_str = Key(key);
|
2023-08-15 23:13:13 +00:00
|
|
|
const Slice k(key_str);
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
Slice v;
|
|
|
|
if (test_standalone_range_deletion) {
|
|
|
|
assert(i == 0 || keys.at(i - 1) == key - 1);
|
2023-08-15 23:13:13 +00:00
|
|
|
s = sst_file_writer.Put(k, v);
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
} else {
|
|
|
|
PendingExpectedValue pending_expected_value =
|
|
|
|
shared->PreparePut(column_family, key);
|
|
|
|
const uint32_t value_base = pending_expected_value.GetFinalValueBase();
|
|
|
|
const size_t value_len =
|
|
|
|
GenerateValue(value_base, value, sizeof(value));
|
|
|
|
v = Slice(value, value_len);
|
|
|
|
values.push_back(value_base);
|
|
|
|
pending_expected_values.push_back(pending_expected_value);
|
|
|
|
if (FLAGS_use_put_entity_one_in > 0 &&
|
|
|
|
(value_base % FLAGS_use_put_entity_one_in) == 0) {
|
|
|
|
WideColumns columns = GenerateWideColumns(values.back(), v);
|
|
|
|
s = sst_file_writer.PutEntity(k, columns);
|
|
|
|
} else {
|
|
|
|
s = sst_file_writer.Put(k, v);
|
|
|
|
}
|
2023-08-15 23:13:13 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
if (s.ok() && !keys.empty()) {
|
|
|
|
s = sst_file_writer.Finish();
|
2022-09-28 23:21:43 +00:00
|
|
|
}
|
|
|
|
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
if (s.ok() && total_keys != 0 && test_standalone_range_deletion) {
|
|
|
|
int64_t start_key = keys.at(0);
|
|
|
|
int64_t end_key = keys.back() + 1;
|
|
|
|
pending_expected_values =
|
|
|
|
shared->PrepareDeleteRange(column_family, start_key, end_key);
|
|
|
|
auto start_key_str = Key(start_key);
|
|
|
|
const Slice start_key_slice(start_key_str);
|
|
|
|
auto end_key_str = Key(end_key);
|
|
|
|
const Slice end_key_slice(end_key_str);
|
|
|
|
s = standalone_rangedel_sst_file_writer.DeleteRange(start_key_slice,
|
|
|
|
end_key_slice);
|
|
|
|
if (s.ok()) {
|
|
|
|
s = standalone_rangedel_sst_file_writer.Finish();
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
if (s.ok()) {
|
Add missing db crash options (#12414)
Summary:
**Context/Summary:**
We are doing a sweep in all public options, including but not limited to the `Options`, `Read/WriteOptions`, `IngestExternalFileOptions`, cache options.., to find and add the uncovered ones into db crash. The options included in this PR require minimum changes to db crash other than adding the options themselves.
A bonus change: to surface new issues by improved coverage in stderror, we decided to fail/terminate crash test for manual compactions (CompactFiles, CompactRange()) on meaningful errors. See https://github.com/facebook/rocksdb/pull/12414/files#diff-5c4ced6afb6a90e27fec18ab03b2cd89e8f99db87791b4ecc6fa2694284d50c0R2528-R2532, https://github.com/facebook/rocksdb/pull/12414/files#diff-5c4ced6afb6a90e27fec18ab03b2cd89e8f99db87791b4ecc6fa2694284d50c0R2330-R2336 for more.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12414
Test Plan:
- Run `python3 ./tools/db_crashtest.py --simple blackbox` for 10 minutes to ensure no trivial failure
- Run `python3 tools/db_crashtest.py --simple blackbox --compact_files_one_in=1 --compact_range_one_in=1 --read_fault_one_in=1 --write_fault_one_in=1 --interval=50` for a while to ensure the bonus change does not result in trivial crash/termination of stress test
Reviewed By: ajkr, jowlyzhang, cbi42
Differential Revision: D54691774
Pulled By: hx235
fbshipit-source-id: 50443dfb6aaabd8e24c79a2e42b68c6de877be88
2024-03-13 00:24:12 +00:00
|
|
|
IngestExternalFileOptions ingest_options;
|
|
|
|
ingest_options.move_files = thread->rand.OneInOpt(2);
|
|
|
|
ingest_options.verify_checksums_before_ingest = thread->rand.OneInOpt(2);
|
|
|
|
ingest_options.verify_checksums_readahead_size =
|
|
|
|
thread->rand.OneInOpt(2) ? 1024 * 1024 : 0;
|
2024-10-16 21:11:22 +00:00
|
|
|
ingest_options.fill_cache = thread->rand.OneInOpt(4);
|
Add missing db crash options (#12414)
Summary:
**Context/Summary:**
We are doing a sweep in all public options, including but not limited to the `Options`, `Read/WriteOptions`, `IngestExternalFileOptions`, cache options.., to find and add the uncovered ones into db crash. The options included in this PR require minimum changes to db crash other than adding the options themselves.
A bonus change: to surface new issues by improved coverage in stderror, we decided to fail/terminate crash test for manual compactions (CompactFiles, CompactRange()) on meaningful errors. See https://github.com/facebook/rocksdb/pull/12414/files#diff-5c4ced6afb6a90e27fec18ab03b2cd89e8f99db87791b4ecc6fa2694284d50c0R2528-R2532, https://github.com/facebook/rocksdb/pull/12414/files#diff-5c4ced6afb6a90e27fec18ab03b2cd89e8f99db87791b4ecc6fa2694284d50c0R2330-R2336 for more.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12414
Test Plan:
- Run `python3 ./tools/db_crashtest.py --simple blackbox` for 10 minutes to ensure no trivial failure
- Run `python3 tools/db_crashtest.py --simple blackbox --compact_files_one_in=1 --compact_range_one_in=1 --read_fault_one_in=1 --write_fault_one_in=1 --interval=50` for a while to ensure the bonus change does not result in trivial crash/termination of stress test
Reviewed By: ajkr, jowlyzhang, cbi42
Differential Revision: D54691774
Pulled By: hx235
fbshipit-source-id: 50443dfb6aaabd8e24c79a2e42b68c6de877be88
2024-03-13 00:24:12 +00:00
|
|
|
ingest_options_oss << "move_files: " << ingest_options.move_files
|
|
|
|
<< ", verify_checksums_before_ingest: "
|
|
|
|
<< ingest_options.verify_checksums_before_ingest
|
|
|
|
<< ", verify_checksums_readahead_size: "
|
2024-10-16 21:11:22 +00:00
|
|
|
<< ingest_options.verify_checksums_readahead_size
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
<< ", fill_cache: " << ingest_options.fill_cache
|
|
|
|
<< ", test_standalone_range_deletion: "
|
|
|
|
<< test_standalone_range_deletion;
|
2019-12-09 07:49:32 +00:00
|
|
|
s = db_->IngestExternalFile(column_families_[column_family],
|
Optimize compaction for standalone range deletion files (#13078)
Summary:
This PR adds some optimization for compacting standalone range deletion files. A standalone range deletion file is one with just a single range deletion. Currently, such a file is used in bulk loading to achieve something like atomically delete old version of all data with one big range deletion and adding new version of data. These are the changes included in the PR:
1) When a standalone range deletion file is ingested via bulk loading, it's marked for compaction.
2) When picking input files during compaction picking, we attempt to only pick a standalone range deletion file when oldest snapshot is at or above the file's seqno. To do this, `PickCompaction` API is updated to take existing snapshots as an input. This is only done for the universal compaction + UDT disabled combination, we save querying for existing snapshots and not pass it for all other cases.
3) At `Compaction` construction time, the input files will be filtered to examine if any of them can be skipped for compaction iterator. For example, if all the data of the file is deleted by a standalone range tombstone, and the oldest snapshot is at or above such range tombstone, this file will be filtered out.
4) Every time a snapshot is released, we examine if any column family has standalone range deletion files that becomes eligible to be scheduled for compaction. And schedule one for it.
Potential future improvements:
- Add some dedicated statistics for the filtered files.
- Extend this input filtering to L0 files' compactions cases when a newer L0 file could shadow an older L0 file
Pull Request resolved: https://github.com/facebook/rocksdb/pull/13078
Test Plan: Added unit tests and stress tested a few rounds
Reviewed By: cbi42
Differential Revision: D64879415
Pulled By: jowlyzhang
fbshipit-source-id: 02b8683fddbe11f093bcaa0a38406deb39f44d9e
2024-10-25 16:32:14 +00:00
|
|
|
external_files, ingest_options);
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
2024-01-16 21:28:51 +00:00
|
|
|
for (PendingExpectedValue& pending_expected_value :
|
|
|
|
pending_expected_values) {
|
|
|
|
pending_expected_value.Rollback();
|
|
|
|
}
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
|
2024-06-25 03:51:39 +00:00
|
|
|
if (!IsErrorInjectedAndRetryable(s)) {
|
Add missing db crash options (#12414)
Summary:
**Context/Summary:**
We are doing a sweep in all public options, including but not limited to the `Options`, `Read/WriteOptions`, `IngestExternalFileOptions`, cache options.., to find and add the uncovered ones into db crash. The options included in this PR require minimum changes to db crash other than adding the options themselves.
A bonus change: to surface new issues by improved coverage in stderror, we decided to fail/terminate crash test for manual compactions (CompactFiles, CompactRange()) on meaningful errors. See https://github.com/facebook/rocksdb/pull/12414/files#diff-5c4ced6afb6a90e27fec18ab03b2cd89e8f99db87791b4ecc6fa2694284d50c0R2528-R2532, https://github.com/facebook/rocksdb/pull/12414/files#diff-5c4ced6afb6a90e27fec18ab03b2cd89e8f99db87791b4ecc6fa2694284d50c0R2330-R2336 for more.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12414
Test Plan:
- Run `python3 ./tools/db_crashtest.py --simple blackbox` for 10 minutes to ensure no trivial failure
- Run `python3 tools/db_crashtest.py --simple blackbox --compact_files_one_in=1 --compact_range_one_in=1 --read_fault_one_in=1 --write_fault_one_in=1 --interval=50` for a while to ensure the bonus change does not result in trivial crash/termination of stress test
Reviewed By: ajkr, jowlyzhang, cbi42
Differential Revision: D54691774
Pulled By: hx235
fbshipit-source-id: 50443dfb6aaabd8e24c79a2e42b68c6de877be88
2024-03-13 00:24:12 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"file ingestion error: %s under specified "
|
|
|
|
"IngestExternalFileOptions: %s (Empty string or "
|
|
|
|
"missing field indicates default option or value is used)\n",
|
|
|
|
s.ToString().c_str(), ingest_options_oss.str().c_str());
|
2023-09-18 23:23:26 +00:00
|
|
|
thread->shared->SafeTerminate();
|
|
|
|
}
|
|
|
|
} else {
|
2024-01-16 21:28:51 +00:00
|
|
|
for (PendingExpectedValue& pending_expected_value :
|
|
|
|
pending_expected_values) {
|
|
|
|
pending_expected_value.Commit();
|
2023-09-18 23:23:26 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
// Given a key K, this creates an iterator which scans the range
|
|
|
|
// [K, K + FLAGS_num_iterations) forward and backward.
|
|
|
|
// Then does a random sequence of Next/Prev operations.
|
|
|
|
Status TestIterateAgainstExpected(
|
|
|
|
ThreadState* thread, const ReadOptions& read_opts,
|
|
|
|
const std::vector<int>& rand_column_families,
|
2022-09-15 22:55:37 +00:00
|
|
|
const std::vector<int64_t>& rand_keys) override {
|
2022-10-14 21:25:05 +00:00
|
|
|
assert(thread);
|
|
|
|
assert(!rand_column_families.empty());
|
|
|
|
assert(!rand_keys.empty());
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
auto shared = thread->shared;
|
2022-10-14 21:25:05 +00:00
|
|
|
assert(shared);
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
int64_t max_key = shared->GetMaxKey();
|
2022-10-14 21:25:05 +00:00
|
|
|
|
|
|
|
const int64_t num_iter = static_cast<int64_t>(FLAGS_num_iterations);
|
|
|
|
|
|
|
|
int64_t lb = rand_keys[0];
|
|
|
|
if (lb > max_key - num_iter) {
|
|
|
|
lb = thread->rand.Next() % (max_key - num_iter + 1);
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
|
|
|
const int64_t ub = lb + num_iter;
|
|
|
|
|
|
|
|
const int rand_column_family = rand_column_families[0];
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
|
|
|
|
// Testing parallel read and write to the same key with user timestamp
|
|
|
|
// is not currently supported
|
|
|
|
std::vector<std::unique_ptr<MutexLock>> range_locks;
|
|
|
|
if (FLAGS_user_timestamp_size > 0) {
|
|
|
|
range_locks = shared->GetLocksForKeyRange(rand_column_family, lb, ub);
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
|
|
|
ReadOptions ro(read_opts);
|
2023-08-14 21:57:28 +00:00
|
|
|
if (FLAGS_prefix_size > 0) {
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Verify Iterator/Get() against expected state in only `no_batched_ops_test` (#10590)
Summary:
https://github.com/facebook/rocksdb/issues/10538 added `TestIterateAgainstExpected()` in `no_batched_ops_test` to verify iterator correctness against the in memory expected state. It is not compatible when run after some other stress tests, e.g. `TestPut()` in `batched_op_stress`, that either do not set expected state when writing to DB or use keys that cannot be parsed by `GetIntVal()`. The assert [here](https://github.com/facebook/rocksdb/blob/d17be55aab80b856f96f4af89f8d18fef96646b4/db_stress_tool/db_stress_common.h#L520) could fail. This PR fixed this issue by setting iterator upperbound to `max_key` when `destroy_db_initially=0` to avoid the key space that `batched_op_stress` touches.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10590
Test Plan:
```
# set up DB with batched_op_stress
./db_stress --test_batches_snapshots=1 --verify_iterator_with_expected_state_one_in=1 --max_key_len=3 --max_key=100000000 --skip_verifydb=1 --continuous_verification_interval=0 --writepercent=85 --delpercent=3 --delrangepercent=0 --iterpercent=10 --nooverwritepercent=1 --prefixpercent=0 --readpercent=2 --key_len_percent_dist=1,30,69
# Before this PR, the following test will fail the asserts with error msg like the following
# Assertion failed: (size_key <= key_gen_ctx.weights.size() * sizeof(uint64_t)), function GetIntVal, file db_stress_common.h, line 524.
./db_stress --verify_iterator_with_expected_state_one_in=1 --max_key_len=3 --max_key=100000000 --skip_verifydb=1 --continuous_verification_interval=0 --writepercent=0 --delpercent=3 --delrangepercent=0 --iterpercent=95 --nooverwritepercent=1 --prefixpercent=0 --readpercent=2 --key_len_percent_dist=1,30,69 --destroy_db_initially=0
```
Reviewed By: ajkr
Differential Revision: D39085243
Pulled By: cbi42
fbshipit-source-id: a7dfee2320c330773b623b442d730fd014ec7056
2022-08-29 16:51:40 +00:00
|
|
|
std::string read_ts_str;
|
|
|
|
Slice read_ts;
|
|
|
|
if (FLAGS_user_timestamp_size > 0) {
|
|
|
|
read_ts_str = GetNowNanos();
|
|
|
|
read_ts = read_ts_str;
|
2022-10-14 21:25:05 +00:00
|
|
|
ro.timestamp = &read_ts;
|
Verify Iterator/Get() against expected state in only `no_batched_ops_test` (#10590)
Summary:
https://github.com/facebook/rocksdb/issues/10538 added `TestIterateAgainstExpected()` in `no_batched_ops_test` to verify iterator correctness against the in memory expected state. It is not compatible when run after some other stress tests, e.g. `TestPut()` in `batched_op_stress`, that either do not set expected state when writing to DB or use keys that cannot be parsed by `GetIntVal()`. The assert [here](https://github.com/facebook/rocksdb/blob/d17be55aab80b856f96f4af89f8d18fef96646b4/db_stress_tool/db_stress_common.h#L520) could fail. This PR fixed this issue by setting iterator upperbound to `max_key` when `destroy_db_initially=0` to avoid the key space that `batched_op_stress` touches.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10590
Test Plan:
```
# set up DB with batched_op_stress
./db_stress --test_batches_snapshots=1 --verify_iterator_with_expected_state_one_in=1 --max_key_len=3 --max_key=100000000 --skip_verifydb=1 --continuous_verification_interval=0 --writepercent=85 --delpercent=3 --delrangepercent=0 --iterpercent=10 --nooverwritepercent=1 --prefixpercent=0 --readpercent=2 --key_len_percent_dist=1,30,69
# Before this PR, the following test will fail the asserts with error msg like the following
# Assertion failed: (size_key <= key_gen_ctx.weights.size() * sizeof(uint64_t)), function GetIntVal, file db_stress_common.h, line 524.
./db_stress --verify_iterator_with_expected_state_one_in=1 --max_key_len=3 --max_key=100000000 --skip_verifydb=1 --continuous_verification_interval=0 --writepercent=0 --delpercent=3 --delrangepercent=0 --iterpercent=95 --nooverwritepercent=1 --prefixpercent=0 --readpercent=2 --key_len_percent_dist=1,30,69 --destroy_db_initially=0
```
Reviewed By: ajkr
Differential Revision: D39085243
Pulled By: cbi42
fbshipit-source-id: a7dfee2320c330773b623b442d730fd014ec7056
2022-08-29 16:51:40 +00:00
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Verify Iterator/Get() against expected state in only `no_batched_ops_test` (#10590)
Summary:
https://github.com/facebook/rocksdb/issues/10538 added `TestIterateAgainstExpected()` in `no_batched_ops_test` to verify iterator correctness against the in memory expected state. It is not compatible when run after some other stress tests, e.g. `TestPut()` in `batched_op_stress`, that either do not set expected state when writing to DB or use keys that cannot be parsed by `GetIntVal()`. The assert [here](https://github.com/facebook/rocksdb/blob/d17be55aab80b856f96f4af89f8d18fef96646b4/db_stress_tool/db_stress_common.h#L520) could fail. This PR fixed this issue by setting iterator upperbound to `max_key` when `destroy_db_initially=0` to avoid the key space that `batched_op_stress` touches.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10590
Test Plan:
```
# set up DB with batched_op_stress
./db_stress --test_batches_snapshots=1 --verify_iterator_with_expected_state_one_in=1 --max_key_len=3 --max_key=100000000 --skip_verifydb=1 --continuous_verification_interval=0 --writepercent=85 --delpercent=3 --delrangepercent=0 --iterpercent=10 --nooverwritepercent=1 --prefixpercent=0 --readpercent=2 --key_len_percent_dist=1,30,69
# Before this PR, the following test will fail the asserts with error msg like the following
# Assertion failed: (size_key <= key_gen_ctx.weights.size() * sizeof(uint64_t)), function GetIntVal, file db_stress_common.h, line 524.
./db_stress --verify_iterator_with_expected_state_one_in=1 --max_key_len=3 --max_key=100000000 --skip_verifydb=1 --continuous_verification_interval=0 --writepercent=0 --delpercent=3 --delrangepercent=0 --iterpercent=95 --nooverwritepercent=1 --prefixpercent=0 --readpercent=2 --key_len_percent_dist=1,30,69 --destroy_db_initially=0
```
Reviewed By: ajkr
Differential Revision: D39085243
Pulled By: cbi42
fbshipit-source-id: a7dfee2320c330773b623b442d730fd014ec7056
2022-08-29 16:51:40 +00:00
|
|
|
std::string max_key_str;
|
|
|
|
Slice max_key_slice;
|
|
|
|
if (!FLAGS_destroy_db_initially) {
|
|
|
|
max_key_str = Key(max_key);
|
|
|
|
max_key_slice = max_key_str;
|
|
|
|
// to restrict iterator from reading keys written in batched_op_stress
|
|
|
|
// that do not have expected state updated and may not be parseable by
|
|
|
|
// GetIntVal().
|
2022-10-14 21:25:05 +00:00
|
|
|
ro.iterate_upper_bound = &max_key_slice;
|
Verify Iterator/Get() against expected state in only `no_batched_ops_test` (#10590)
Summary:
https://github.com/facebook/rocksdb/issues/10538 added `TestIterateAgainstExpected()` in `no_batched_ops_test` to verify iterator correctness against the in memory expected state. It is not compatible when run after some other stress tests, e.g. `TestPut()` in `batched_op_stress`, that either do not set expected state when writing to DB or use keys that cannot be parsed by `GetIntVal()`. The assert [here](https://github.com/facebook/rocksdb/blob/d17be55aab80b856f96f4af89f8d18fef96646b4/db_stress_tool/db_stress_common.h#L520) could fail. This PR fixed this issue by setting iterator upperbound to `max_key` when `destroy_db_initially=0` to avoid the key space that `batched_op_stress` touches.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10590
Test Plan:
```
# set up DB with batched_op_stress
./db_stress --test_batches_snapshots=1 --verify_iterator_with_expected_state_one_in=1 --max_key_len=3 --max_key=100000000 --skip_verifydb=1 --continuous_verification_interval=0 --writepercent=85 --delpercent=3 --delrangepercent=0 --iterpercent=10 --nooverwritepercent=1 --prefixpercent=0 --readpercent=2 --key_len_percent_dist=1,30,69
# Before this PR, the following test will fail the asserts with error msg like the following
# Assertion failed: (size_key <= key_gen_ctx.weights.size() * sizeof(uint64_t)), function GetIntVal, file db_stress_common.h, line 524.
./db_stress --verify_iterator_with_expected_state_one_in=1 --max_key_len=3 --max_key=100000000 --skip_verifydb=1 --continuous_verification_interval=0 --writepercent=0 --delpercent=3 --delrangepercent=0 --iterpercent=95 --nooverwritepercent=1 --prefixpercent=0 --readpercent=2 --key_len_percent_dist=1,30,69 --destroy_db_initially=0
```
Reviewed By: ajkr
Differential Revision: D39085243
Pulled By: cbi42
fbshipit-source-id: a7dfee2320c330773b623b442d730fd014ec7056
2022-08-29 16:51:40 +00:00
|
|
|
}
|
2024-06-18 23:16:09 +00:00
|
|
|
std::string ub_str, lb_str;
|
|
|
|
if (FLAGS_use_sqfc_for_range_queries) {
|
|
|
|
ub_str = Key(ub);
|
|
|
|
lb_str = Key(lb);
|
|
|
|
ro.table_filter =
|
|
|
|
sqfc_factory_->GetTableFilterForRangeQuery(lb_str, ub_str);
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
|
|
|
ColumnFamilyHandle* const cfh = column_families_[rand_column_family];
|
|
|
|
assert(cfh);
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
const std::size_t expected_values_size = static_cast<std::size_t>(ub - lb);
|
|
|
|
std::vector<ExpectedValue> pre_read_expected_values;
|
|
|
|
std::vector<ExpectedValue> post_read_expected_values;
|
|
|
|
|
|
|
|
for (int64_t i = 0; i < static_cast<int64_t>(expected_values_size); ++i) {
|
|
|
|
pre_read_expected_values.push_back(
|
|
|
|
shared->Get(rand_column_family, i + lb));
|
|
|
|
}
|
2024-05-31 22:17:06 +00:00
|
|
|
std::unique_ptr<Iterator> iter;
|
|
|
|
if (FLAGS_use_multi_cf_iterator) {
|
|
|
|
std::vector<ColumnFamilyHandle*> cfhs;
|
|
|
|
cfhs.reserve(rand_column_families.size());
|
|
|
|
for (auto cf_index : rand_column_families) {
|
|
|
|
cfhs.emplace_back(column_families_[cf_index]);
|
|
|
|
}
|
|
|
|
assert(!cfhs.empty());
|
|
|
|
iter = db_->NewCoalescingIterator(ro, cfhs);
|
|
|
|
} else {
|
|
|
|
iter = std::unique_ptr<Iterator>(db_->NewIterator(ro, cfh));
|
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
for (int64_t i = 0; i < static_cast<int64_t>(expected_values_size); ++i) {
|
|
|
|
post_read_expected_values.push_back(
|
|
|
|
shared->Get(rand_column_family, i + lb));
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(pre_read_expected_values.size() == expected_values_size &&
|
|
|
|
pre_read_expected_values.size() == post_read_expected_values.size());
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
std::string op_logs;
|
2022-10-14 21:25:05 +00:00
|
|
|
|
|
|
|
auto check_columns = [&]() {
|
|
|
|
assert(iter);
|
|
|
|
assert(iter->Valid());
|
|
|
|
|
2023-03-17 21:47:29 +00:00
|
|
|
if (!VerifyWideColumns(iter->value(), iter->columns())) {
|
2022-10-14 21:25:05 +00:00
|
|
|
shared->SetVerificationFailure();
|
|
|
|
|
|
|
|
fprintf(stderr,
|
|
|
|
"Verification failed for key %s: "
|
2023-03-17 21:47:29 +00:00
|
|
|
"Value and columns inconsistent: value: %s, columns: %s\n",
|
2022-10-14 21:25:05 +00:00
|
|
|
Slice(iter->key()).ToString(/* hex */ true).c_str(),
|
2023-03-17 21:47:29 +00:00
|
|
|
iter->value().ToString(/* hex */ true).c_str(),
|
|
|
|
WideColumnsToHex(iter->columns()).c_str());
|
2022-10-14 21:25:05 +00:00
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
};
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
|
|
|
|
auto check_no_key_in_range = [&](int64_t start, int64_t end) {
|
2023-08-14 21:57:28 +00:00
|
|
|
assert(start <= end);
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
for (auto j = std::max(start, lb); j < std::min(end, ub); ++j) {
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
std::size_t index = static_cast<std::size_t>(j - lb);
|
|
|
|
assert(index < pre_read_expected_values.size() &&
|
|
|
|
index < post_read_expected_values.size());
|
|
|
|
const ExpectedValue pre_read_expected_value =
|
|
|
|
pre_read_expected_values[index];
|
|
|
|
const ExpectedValue post_read_expected_value =
|
|
|
|
post_read_expected_values[index];
|
|
|
|
if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value,
|
|
|
|
post_read_expected_value)) {
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
// Fail fast to preserve the DB state.
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
if (iter->Valid()) {
|
|
|
|
fprintf(stderr,
|
2023-11-03 16:53:22 +00:00
|
|
|
"Verification failed. Expected state has key %s, iterator "
|
|
|
|
"is at key %s\n",
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
Slice(Key(j)).ToString(true).c_str(),
|
|
|
|
iter->key().ToString(true).c_str());
|
|
|
|
} else {
|
2023-11-03 16:53:22 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"Verification failed. Expected state has key %s, iterator "
|
|
|
|
"is invalid\n",
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
Slice(Key(j)).ToString(true).c_str());
|
|
|
|
}
|
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Forward and backward scan to ensure we cover the entire range [lb, ub).
|
|
|
|
// The random sequence Next and Prev test below tends to be very short
|
|
|
|
// ranged.
|
|
|
|
int64_t last_key = lb - 1;
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
std::string key_str = Key(lb);
|
2022-10-14 21:25:05 +00:00
|
|
|
iter->Seek(key_str);
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
op_logs += "S " + Slice(key_str).ToString(true) + " ";
|
2022-10-14 21:25:05 +00:00
|
|
|
|
|
|
|
uint64_t curr = 0;
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
while (true) {
|
2023-08-14 21:57:28 +00:00
|
|
|
assert(last_key < ub);
|
2024-11-11 03:21:35 +00:00
|
|
|
|
|
|
|
if (iter->Valid() && ro.allow_unprepared_value) {
|
|
|
|
op_logs += "*";
|
|
|
|
|
|
|
|
if (!iter->PrepareValue()) {
|
|
|
|
assert(!iter->Valid());
|
|
|
|
assert(!iter->status().ok());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (!iter->Valid()) {
|
|
|
|
if (!iter->status().ok()) {
|
2024-06-25 03:51:39 +00:00
|
|
|
if (IsErrorInjectedAndRetryable(iter->status())) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return iter->status();
|
|
|
|
} else {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr, "TestIterate against expected state error: %s\n",
|
|
|
|
iter->status().ToString().c_str());
|
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return iter->status();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
if (!check_no_key_in_range(last_key + 1, ub)) {
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
|
|
|
if (!check_columns()) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
// iter is valid, the range (last_key, current key) was skipped
|
|
|
|
GetIntVal(iter->key().ToString(), &curr);
|
2023-08-14 21:57:28 +00:00
|
|
|
if (static_cast<int64_t>(curr) <= last_key) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
2023-11-03 16:53:22 +00:00
|
|
|
"TestIterateAgainstExpected failed: found unexpectedly small "
|
|
|
|
"key\n");
|
2023-08-14 21:57:28 +00:00
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
fprintf(stderr, "Last op found key: %s, expected at least: %s\n",
|
|
|
|
Slice(Key(curr)).ToString(true).c_str(),
|
|
|
|
Slice(Key(last_key + 1)).ToString(true).c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (!check_no_key_in_range(last_key + 1, static_cast<int64_t>(curr))) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
last_key = static_cast<int64_t>(curr);
|
|
|
|
if (last_key >= ub - 1) {
|
|
|
|
break;
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
iter->Next();
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
op_logs += "N";
|
|
|
|
}
|
|
|
|
|
|
|
|
// backward scan
|
|
|
|
key_str = Key(ub - 1);
|
2022-10-14 21:25:05 +00:00
|
|
|
iter->SeekForPrev(key_str);
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
op_logs += " SFP " + Slice(key_str).ToString(true) + " ";
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
last_key = ub;
|
|
|
|
while (true) {
|
2023-08-14 21:57:28 +00:00
|
|
|
assert(lb < last_key);
|
2024-11-11 03:21:35 +00:00
|
|
|
|
|
|
|
if (iter->Valid() && ro.allow_unprepared_value) {
|
|
|
|
op_logs += "*";
|
|
|
|
|
|
|
|
if (!iter->PrepareValue()) {
|
|
|
|
assert(!iter->Valid());
|
|
|
|
assert(!iter->status().ok());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (!iter->Valid()) {
|
|
|
|
if (!iter->status().ok()) {
|
2024-06-25 03:51:39 +00:00
|
|
|
if (IsErrorInjectedAndRetryable(iter->status())) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return iter->status();
|
|
|
|
} else {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr, "TestIterate against expected state error: %s\n",
|
|
|
|
iter->status().ToString().c_str());
|
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return iter->status();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
}
|
|
|
|
if (!check_no_key_in_range(lb, last_key)) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
|
|
|
if (!check_columns()) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
// the range (current key, last key) was skipped
|
|
|
|
GetIntVal(iter->key().ToString(), &curr);
|
2023-08-14 21:57:28 +00:00
|
|
|
if (last_key <= static_cast<int64_t>(curr)) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
2023-11-03 16:53:22 +00:00
|
|
|
"TestIterateAgainstExpected failed: found unexpectedly large "
|
|
|
|
"key\n");
|
2023-08-14 21:57:28 +00:00
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
fprintf(stderr, "Last op found key: %s, expected at most: %s\n",
|
|
|
|
Slice(Key(curr)).ToString(true).c_str(),
|
|
|
|
Slice(Key(last_key - 1)).ToString(true).c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (!check_no_key_in_range(static_cast<int64_t>(curr + 1), last_key)) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
last_key = static_cast<int64_t>(curr);
|
|
|
|
if (last_key <= lb) {
|
|
|
|
break;
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
iter->Prev();
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
op_logs += "P";
|
|
|
|
}
|
|
|
|
|
2024-06-01 16:52:48 +00:00
|
|
|
// Write-prepared/write-unprepared transactions and multi-CF iterator do not
|
|
|
|
// support Refresh() yet.
|
2023-11-03 23:27:11 +00:00
|
|
|
if (!(FLAGS_use_txn && FLAGS_txn_write_policy != 0) &&
|
2024-06-01 16:52:48 +00:00
|
|
|
!FLAGS_use_multi_cf_iterator && thread->rand.OneIn(2)) {
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
pre_read_expected_values.clear();
|
|
|
|
post_read_expected_values.clear();
|
2022-10-03 23:22:39 +00:00
|
|
|
// Refresh after forward/backward scan to allow higher chance of SV
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// change.
|
|
|
|
for (int64_t i = 0; i < static_cast<int64_t>(expected_values_size); ++i) {
|
|
|
|
pre_read_expected_values.push_back(
|
|
|
|
shared->Get(rand_column_family, i + lb));
|
|
|
|
}
|
2023-11-03 23:27:11 +00:00
|
|
|
Status rs = iter->Refresh();
|
2024-06-25 03:51:39 +00:00
|
|
|
if (!rs.ok() && IsErrorInjectedAndRetryable(rs)) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return rs;
|
|
|
|
}
|
2023-11-03 23:27:11 +00:00
|
|
|
assert(rs.ok());
|
|
|
|
op_logs += "Refresh ";
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
for (int64_t i = 0; i < static_cast<int64_t>(expected_values_size); ++i) {
|
|
|
|
post_read_expected_values.push_back(
|
|
|
|
shared->Get(rand_column_family, i + lb));
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(pre_read_expected_values.size() == expected_values_size &&
|
|
|
|
pre_read_expected_values.size() ==
|
|
|
|
post_read_expected_values.size());
|
2022-10-03 23:22:39 +00:00
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
// start from middle of [lb, ub) otherwise it is easy to iterate out of
|
|
|
|
// locked range
|
2022-10-14 21:25:05 +00:00
|
|
|
const int64_t mid = lb + num_iter / 2;
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
key_str = Key(mid);
|
2022-10-14 21:25:05 +00:00
|
|
|
const Slice key(key_str);
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (thread->rand.OneIn(2)) {
|
|
|
|
iter->Seek(key);
|
|
|
|
op_logs += " S " + key.ToString(true) + " ";
|
|
|
|
if (!iter->Valid() && iter->status().ok()) {
|
|
|
|
if (!check_no_key_in_range(mid, ub)) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2023-08-14 21:57:28 +00:00
|
|
|
} else if (iter->Valid()) {
|
|
|
|
GetIntVal(iter->key().ToString(), &curr);
|
|
|
|
if (static_cast<int64_t>(curr) < mid) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
2023-11-03 16:53:22 +00:00
|
|
|
"TestIterateAgainstExpected failed: found unexpectedly small "
|
|
|
|
"key\n");
|
2023-08-14 21:57:28 +00:00
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
fprintf(stderr, "Last op found key: %s, expected at least: %s\n",
|
|
|
|
Slice(Key(curr)).ToString(true).c_str(),
|
|
|
|
Slice(Key(mid)).ToString(true).c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
iter->SeekForPrev(key);
|
|
|
|
op_logs += " SFP " + key.ToString(true) + " ";
|
|
|
|
if (!iter->Valid() && iter->status().ok()) {
|
|
|
|
// iterator says nothing <= mid
|
|
|
|
if (!check_no_key_in_range(lb, mid + 1)) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
2023-08-14 21:57:28 +00:00
|
|
|
} else if (iter->Valid()) {
|
|
|
|
GetIntVal(iter->key().ToString(), &curr);
|
|
|
|
if (mid < static_cast<int64_t>(curr)) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr,
|
2023-11-03 16:53:22 +00:00
|
|
|
"TestIterateAgainstExpected failed: found unexpectedly large "
|
|
|
|
"key\n");
|
2023-08-14 21:57:28 +00:00
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
fprintf(stderr, "Last op found key: %s, expected at most: %s\n",
|
|
|
|
Slice(Key(curr)).ToString(true).c_str(),
|
|
|
|
Slice(Key(mid)).ToString(true).c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-14 21:25:05 +00:00
|
|
|
for (int64_t i = 0; i < num_iter && iter->Valid(); ++i) {
|
2024-11-11 03:21:35 +00:00
|
|
|
if (ro.allow_unprepared_value) {
|
|
|
|
op_logs += "*";
|
|
|
|
|
|
|
|
if (!iter->PrepareValue()) {
|
|
|
|
assert(!iter->Valid());
|
|
|
|
assert(!iter->status().ok());
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-14 21:25:05 +00:00
|
|
|
if (!check_columns()) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
GetIntVal(iter->key().ToString(), &curr);
|
2022-10-14 21:25:05 +00:00
|
|
|
if (static_cast<int64_t>(curr) < lb) {
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
iter->Next();
|
|
|
|
op_logs += "N";
|
2023-10-03 00:47:24 +00:00
|
|
|
} else if (static_cast<int64_t>(curr) >= ub) {
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
iter->Prev();
|
|
|
|
op_logs += "P";
|
|
|
|
} else {
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
const uint32_t value_base_from_db = GetValueBase(iter->value());
|
|
|
|
std::size_t index = static_cast<std::size_t>(curr - lb);
|
|
|
|
assert(index < pre_read_expected_values.size() &&
|
|
|
|
index < post_read_expected_values.size());
|
|
|
|
const ExpectedValue pre_read_expected_value =
|
|
|
|
pre_read_expected_values[index];
|
|
|
|
const ExpectedValue post_read_expected_value =
|
|
|
|
post_read_expected_values[index];
|
|
|
|
if (ExpectedValueHelper::MustHaveNotExisted(pre_read_expected_value,
|
|
|
|
post_read_expected_value) ||
|
|
|
|
!ExpectedValueHelper::InExpectedValueBaseRange(
|
|
|
|
value_base_from_db, pre_read_expected_value,
|
|
|
|
post_read_expected_value)) {
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
// Fail fast to preserve the DB state.
|
|
|
|
thread->shared->SetVerificationFailure();
|
2023-11-03 16:53:22 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"Verification failed: iterator has key %s, but expected "
|
|
|
|
"state does not.\n",
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
iter->key().ToString(true).c_str());
|
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
break;
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (thread->rand.OneIn(2)) {
|
|
|
|
iter->Next();
|
|
|
|
op_logs += "N";
|
|
|
|
if (!iter->Valid()) {
|
|
|
|
break;
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
uint64_t next = 0;
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
GetIntVal(iter->key().ToString(), &next);
|
2023-08-14 21:57:28 +00:00
|
|
|
if (next <= curr) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
2023-11-03 16:53:22 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"TestIterateAgainstExpected failed: found unexpectedly "
|
|
|
|
"small key\n");
|
2023-08-14 21:57:28 +00:00
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
fprintf(stderr, "Last op found key: %s, expected at least: %s\n",
|
|
|
|
Slice(Key(next)).ToString(true).c_str(),
|
|
|
|
Slice(Key(curr + 1)).ToString(true).c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (!check_no_key_in_range(static_cast<int64_t>(curr + 1),
|
|
|
|
static_cast<int64_t>(next))) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
iter->Prev();
|
|
|
|
op_logs += "P";
|
|
|
|
if (!iter->Valid()) {
|
|
|
|
break;
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
uint64_t prev = 0;
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
GetIntVal(iter->key().ToString(), &prev);
|
2023-08-14 21:57:28 +00:00
|
|
|
if (curr <= prev) {
|
|
|
|
thread->shared->SetVerificationFailure();
|
2023-11-03 16:53:22 +00:00
|
|
|
fprintf(stderr,
|
|
|
|
"TestIterateAgainstExpected failed: found unexpectedly "
|
|
|
|
"large key\n");
|
2023-08-14 21:57:28 +00:00
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
fprintf(stderr, "Last op found key: %s, expected at most: %s\n",
|
|
|
|
Slice(Key(prev)).ToString(true).c_str(),
|
|
|
|
Slice(Key(curr - 1)).ToString(true).c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (!check_no_key_in_range(static_cast<int64_t>(prev + 1),
|
|
|
|
static_cast<int64_t>(curr))) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
if (!iter->status().ok()) {
|
2024-06-25 03:51:39 +00:00
|
|
|
if (IsErrorInjectedAndRetryable(iter->status())) {
|
Inject more errors to more files in stress test (#12713)
Summary:
**Context:**
We currently have partial error injection:
- DB operation: all read, SST write
- DB open: all read, SST write, all metadata write.
This PR completes the error injection (with some limitations below):
- DB operation & open: all read, all write, all metadata write, all metadata read
**Summary:**
- Inject retryable metadata read, metadata write error concerning directory (e.g, dir sync, ) or file metadata (e.g, name, size, file creation/deletion...)
- Inject retryable errors to all major file types: random access file, sequential file, writable file
- Allow db stress test operations to handle above injected errors gracefully without crashing
- Change all error injection to thread-local implementation for easier disabling and enabling in the same thread. For example, we can control error handling thread to have no error injection. It's also cleaner in code.
- Limitation: compared to before, we now don't have write fault injection for backup/restore CopyOrCreateFiles work threads since they use anonymous background threads as well as read injection for db open bg thread
- Add a new flag to test error recovery without error injection so we can test the path where error recovery actually succeeds
- Some Refactory & fix to db stress test framework (see PR review comments)
- Fix some minor bugs surfaced (see PR review comments)
- Limitation: had to disable backup restore with metadata read/write injection since it surfaces too many testing issues. Will add it back later to focus on surfacing actual code/internal bugs first.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12713
Test Plan:
- Existing UT
- CI with no trivial error failure
Reviewed By: pdillinger
Differential Revision: D58326608
Pulled By: hx235
fbshipit-source-id: 011b5195aaeb6011641ae0a9194f7f2a0e325ad7
2024-06-19 15:42:00 +00:00
|
|
|
return iter->status();
|
|
|
|
} else {
|
|
|
|
thread->shared->SetVerificationFailure();
|
|
|
|
fprintf(stderr, "TestIterate against expected state error: %s\n",
|
|
|
|
iter->status().ToString().c_str());
|
|
|
|
fprintf(stderr, "Column family: %s, op_logs: %s\n",
|
|
|
|
cfh->GetName().c_str(), op_logs.c_str());
|
|
|
|
thread->stats.AddErrors(1);
|
|
|
|
return iter->status();
|
|
|
|
}
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
}
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
thread->stats.AddIterations(1);
|
2022-10-14 21:25:05 +00:00
|
|
|
|
Add Iterator test against expected state to stress test (#10538)
Summary:
As mentioned in https://github.com/facebook/rocksdb/pull/5506#issuecomment-506021913,
`db_stress` does not have much verification for iterator correctness.
It has a `TestIterate()` function, but that is mainly for comparing results
between two iterators, one with `total_order_seek` and the other optionally
sets auto_prefix, upper/lower bounds. Commit 49a0581ad2462e31aa3f768afa769e0d33390f33
added a new `TestIterateAgainstExpected()` function that compares iterator against
expected state. It locks a range of keys, creates an iterator, does
a random sequence of `Next/Prev` and compares against expected state.
This PR is based on that commit, the main changes include some logs
(for easier debugging if a test fails), a forward and backward scan to
cover the entire locked key range, and a flag for optionally turning on
this version of Iterator testing.
Added constraint that the checks against expected state in
`TestIterateAgainstExpected()` and in `TestGet()` are only turned on
when `--skip_verifydb` flag is not set.
Remove the change log introduced in https://github.com/facebook/rocksdb/issues/10553.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10538
Test Plan:
Run `db_stress` with `--verify_iterator_with_expected_state_one_in=1`,
and a large `--iterpercent` and `--num_iterations`. Checked `op_logs`
manually to ensure expected coverage. Tweaked part of the code in
https://github.com/facebook/rocksdb/issues/10449 and stress test was able to catch it.
- internally run various flavor of crash test
Reviewed By: ajkr
Differential Revision: D38847269
Pulled By: cbi42
fbshipit-source-id: 8b4402a9bba9f6cfa08051943cd672579d489599
2022-08-24 21:59:50 +00:00
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
2024-05-10 19:34:53 +00:00
|
|
|
bool VerifyOrSyncValue(int cf, int64_t key, const ReadOptions& opts,
|
2022-04-27 13:01:09 +00:00
|
|
|
SharedState* shared, const std::string& value_from_db,
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
std::string msg_prefix, const Status& s) const {
|
2019-12-09 07:49:32 +00:00
|
|
|
if (shared->HasVerificationFailedYet()) {
|
|
|
|
return false;
|
|
|
|
}
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
const ExpectedValue expected_value = shared->Get(cf, key);
|
2023-01-31 18:17:48 +00:00
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
if (expected_value.PendingWrite() || expected_value.PendingDelete()) {
|
2022-04-27 13:01:09 +00:00
|
|
|
if (s.ok()) {
|
|
|
|
// Value exists in db, update state to reflect that
|
|
|
|
Slice slice(value_from_db);
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
uint32_t value_base = GetValueBase(slice);
|
|
|
|
shared->SyncPut(cf, key, value_base);
|
2024-01-12 20:23:09 +00:00
|
|
|
return true;
|
2022-04-27 13:01:09 +00:00
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
// Value doesn't exist in db, update state to reflect that
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
shared->SyncDelete(cf, key);
|
2024-01-12 20:23:09 +00:00
|
|
|
return true;
|
2024-08-15 19:32:59 +00:00
|
|
|
} else {
|
|
|
|
assert(false);
|
2022-04-27 13:01:09 +00:00
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
2024-01-09 22:20:08 +00:00
|
|
|
char expected_value_data[kValueMaxLen];
|
|
|
|
size_t expected_value_data_size =
|
|
|
|
GenerateValue(expected_value.GetValueBase(), expected_value_data,
|
|
|
|
sizeof(expected_value_data));
|
2019-12-09 07:49:32 +00:00
|
|
|
|
2024-05-10 19:34:53 +00:00
|
|
|
std::ostringstream read_u64ts;
|
|
|
|
if (opts.timestamp) {
|
|
|
|
read_u64ts << " while read with timestamp: ";
|
|
|
|
uint64_t read_ts;
|
|
|
|
if (DecodeU64Ts(*opts.timestamp, &read_ts).ok()) {
|
|
|
|
read_u64ts << std::to_string(read_ts) << ", ";
|
|
|
|
} else {
|
|
|
|
read_u64ts << s.ToString()
|
|
|
|
<< " Encoded read timestamp: " << opts.timestamp->ToString()
|
|
|
|
<< ", ";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
// compare value_from_db with the value in the shared state
|
2019-12-09 07:49:32 +00:00
|
|
|
if (s.ok()) {
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
const Slice slice(value_from_db);
|
|
|
|
const uint32_t value_base_from_db = GetValueBase(slice);
|
|
|
|
if (ExpectedValueHelper::MustHaveNotExisted(expected_value,
|
2023-10-02 23:07:39 +00:00
|
|
|
expected_value)) {
|
2024-05-10 19:34:53 +00:00
|
|
|
VerificationAbort(
|
|
|
|
shared, msg_prefix + ": Unexpected value found" + read_u64ts.str(),
|
|
|
|
cf, key, value_from_db, "");
|
2019-12-09 07:49:32 +00:00
|
|
|
return false;
|
|
|
|
}
|
2023-10-02 23:07:39 +00:00
|
|
|
if (!ExpectedValueHelper::InExpectedValueBaseRange(
|
|
|
|
value_base_from_db, expected_value, expected_value)) {
|
2024-05-10 19:34:53 +00:00
|
|
|
VerificationAbort(
|
|
|
|
shared, msg_prefix + ": Unexpected value found" + read_u64ts.str(),
|
|
|
|
cf, key, value_from_db,
|
|
|
|
Slice(expected_value_data, expected_value_data_size));
|
2023-10-02 23:07:39 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
// TODO: are the length/memcmp() checks repetitive?
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
if (value_from_db.length() != expected_value_data_size) {
|
2023-01-27 15:45:25 +00:00
|
|
|
VerificationAbort(shared,
|
2024-05-10 19:34:53 +00:00
|
|
|
msg_prefix + ": Length of value read is not equal" +
|
|
|
|
read_u64ts.str(),
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
cf, key, value_from_db,
|
|
|
|
Slice(expected_value_data, expected_value_data_size));
|
2019-12-09 07:49:32 +00:00
|
|
|
return false;
|
|
|
|
}
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
if (memcmp(value_from_db.data(), expected_value_data,
|
|
|
|
expected_value_data_size) != 0) {
|
2023-01-27 15:45:25 +00:00
|
|
|
VerificationAbort(shared,
|
2024-05-10 19:34:53 +00:00
|
|
|
msg_prefix + ": Contents of value read don't match" +
|
|
|
|
read_u64ts.str(),
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
cf, key, value_from_db,
|
|
|
|
Slice(expected_value_data, expected_value_data_size));
|
2019-12-09 07:49:32 +00:00
|
|
|
return false;
|
|
|
|
}
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
} else if (s.IsNotFound()) {
|
|
|
|
if (ExpectedValueHelper::MustHaveExisted(expected_value,
|
|
|
|
expected_value)) {
|
|
|
|
VerificationAbort(
|
2024-05-10 19:34:53 +00:00
|
|
|
shared,
|
|
|
|
msg_prefix + ": Value not found " + read_u64ts.str() + s.ToString(),
|
|
|
|
cf, key, "", Slice(expected_value_data, expected_value_data_size));
|
2019-12-09 07:49:32 +00:00
|
|
|
return false;
|
|
|
|
}
|
Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058)
Summary:
**Context:**
Current `NonBatchedOpsStressTest` does not allow multi-thread read (i.e, Get, Iterator) and write (i.e, Put, Merge) or delete to the same key. Every read or write/delete operation will acquire lock (`GetLocksForKeyRange`) on the target key to gain exclusive access to it. This does not align with RocksDB's nature of allowing multi-thread read and write/delete to the same key, that is concurrent threads can issue read/write/delete to RocksDB without external locking. Therefore this is a gap in our testing coverage.
To close the gap, biggest challenge remains in verifying db value against expected state in presence of parallel read and write/delete. The challenge is due to read/write/delete to the db and read/write to expected state is not within one atomic operation. Therefore we may not know the exact expected state of a certain db read, as by the time we read the expected state for that db read, another write to expected state for another db write to the same key might have changed the expected state.
**Summary:**
Credited to ajkr's idea, we now solve this challenge by breaking the 32-bits expected value of a key into different parts that can be read and write to in parallel.
Basically we divide the 32-bits expected value into `value_base` (corresponding to the previous whole 32 bits but now with some shrinking in the value base range we allow), `pending_write` (i.e, whether there is an ongoing concurrent write), `del_counter` (i.e, number of times a value has been deleted, analogous to value_base for write), `pending_delete` (similar to pending_write) and `deleted` (i.e whether a key is deleted).
Also, we need to use incremental `value_base` instead of random value base as before because we want to control the range of value base a correct db read result can possibly be in presence of parallel read and write. In that way, we can verify the correctness of the read against expected state more easily. This is at the cost of reducing the randomness of the value generated in NonBatchedOpsStressTest we are willing to accept.
(For detailed algorithm of how to use these parts to infer expected state of a key, see the PR)
Misc: hide value_base detail from callers of ExpectedState by abstracting related logics into ExpectedValue class
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11058
Test Plan:
- Manual test of small number of keys (i.e, high chances of parallel read and write/delete to same key) with equally distributed read/write/deleted for 30 min
```
python3 tools/db_crashtest.py --simple {blackbox|whitebox} --sync_fault_injection=1 --skip_verifydb=0 --continuous_verification_interval=1000 --clear_column_family_one_in=0 --max_key=10 --column_families=1 --threads=32 --readpercent=25 --writepercent=25 --nooverwritepercent=0 --iterpercent=25 --verify_iterator_with_expected_state_one_in=1 --num_iterations=5 --delpercent=15 --delrangepercent=10 --range_deletion_width=5 --use_merge={0|1} --use_put_entity_one_in=0 --use_txn=0 --verify_before_write=0 --user_timestamp_size=0 --compact_files_one_in=1000 --compact_range_one_in=1000 --flush_one_in=1000 --get_property_one_in=1000 --ingest_external_file_one_in=100 --backup_one_in=100 --checkpoint_one_in=100 --approximate_size_one_in=0 --acquire_snapshot_one_in=100 --use_multiget=0 --prefixpercent=0 --get_live_files_one_in=1000 --manual_wal_flush_one_in=1000 --pause_background_one_in=1000 --target_file_size_base=524288 --write_buffer_size=524288 --verify_checksum_one_in=1000 --verify_db_one_in=1000
```
- Rehearsal stress test for normal parameter and aggressive parameter to see if such change can find what existing stress test can find (i.e, no regression in testing capability)
- [Ongoing]Try to find new bugs with this change that are not found by current NonBatchedOpsStressTest with no parallel read and write/delete to same key
Reviewed By: ajkr
Differential Revision: D42257258
Pulled By: hx235
fbshipit-source-id: e6fdc18f1fad3753e5ac91731483a644d9b5b6eb
2023-05-15 22:34:22 +00:00
|
|
|
} else {
|
2024-05-10 19:34:53 +00:00
|
|
|
VerificationAbort(
|
|
|
|
shared,
|
|
|
|
msg_prefix + "Non-OK status " + read_u64ts.str() + s.ToString(), cf,
|
|
|
|
key, "", Slice(expected_value_data, expected_value_data_size));
|
2024-01-09 22:20:08 +00:00
|
|
|
return false;
|
2019-12-09 07:49:32 +00:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
2022-05-02 20:25:45 +00:00
|
|
|
|
|
|
|
void PrepareTxnDbOptions(SharedState* shared,
|
|
|
|
TransactionDBOptions& txn_db_opts) override {
|
|
|
|
txn_db_opts.rollback_deletion_type_callback =
|
|
|
|
[shared](TransactionDB*, ColumnFamilyHandle*, const Slice& key) {
|
|
|
|
assert(shared);
|
|
|
|
uint64_t key_num = 0;
|
|
|
|
bool ok = GetIntVal(key.ToString(), &key_num);
|
|
|
|
assert(ok);
|
|
|
|
(void)ok;
|
|
|
|
return !shared->AllowsOverwrite(key_num);
|
|
|
|
};
|
|
|
|
}
|
2024-05-24 21:24:17 +00:00
|
|
|
|
|
|
|
void MaybeAddKeyToTxnForRYW(
|
|
|
|
ThreadState* thread, int column_family, int64_t key, Transaction* txn,
|
|
|
|
std::unordered_map<std::string, ExpectedValue>& ryw_expected_values) {
|
|
|
|
assert(thread);
|
|
|
|
assert(txn);
|
|
|
|
|
|
|
|
SharedState* const shared = thread->shared;
|
|
|
|
assert(shared);
|
|
|
|
|
2024-08-15 19:32:59 +00:00
|
|
|
const ExpectedValue expected_value =
|
|
|
|
thread->shared->Get(column_family, key);
|
|
|
|
bool may_exist = !ExpectedValueHelper::MustHaveNotExisted(expected_value,
|
|
|
|
expected_value);
|
|
|
|
if (!shared->AllowsOverwrite(key) && may_exist) {
|
2024-05-24 21:24:17 +00:00
|
|
|
// Just do read your write checks for keys that allow overwrites.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// With a 1 in 10 probability, insert the just added key in the batch
|
|
|
|
// into the transaction. This will create an overlap with the MultiGet
|
|
|
|
// keys and exercise some corner cases in the code
|
|
|
|
if (thread->rand.OneIn(10)) {
|
2024-05-28 23:54:10 +00:00
|
|
|
assert(column_family >= 0);
|
|
|
|
assert(column_family < static_cast<int>(column_families_.size()));
|
|
|
|
|
2024-05-24 21:24:17 +00:00
|
|
|
ColumnFamilyHandle* const cfh = column_families_[column_family];
|
|
|
|
assert(cfh);
|
|
|
|
|
|
|
|
const std::string k = Key(key);
|
|
|
|
|
|
|
|
enum class Op {
|
2024-05-28 23:54:10 +00:00
|
|
|
PutOrPutEntity,
|
2024-05-24 21:24:17 +00:00
|
|
|
Merge,
|
|
|
|
Delete,
|
|
|
|
// add new operations above this line
|
|
|
|
NumberOfOps
|
|
|
|
};
|
|
|
|
|
|
|
|
const Op op = static_cast<Op>(
|
|
|
|
thread->rand.Uniform(static_cast<int>(Op::NumberOfOps)));
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
|
|
|
|
switch (op) {
|
2024-05-28 23:54:10 +00:00
|
|
|
case Op::PutOrPutEntity:
|
2024-05-24 21:24:17 +00:00
|
|
|
case Op::Merge: {
|
|
|
|
ExpectedValue put_value;
|
2024-05-28 23:54:10 +00:00
|
|
|
put_value.SyncPut(static_cast<uint32_t>(thread->rand.Uniform(
|
|
|
|
static_cast<int>(ExpectedValue::GetValueBaseMask()))));
|
2024-05-24 21:24:17 +00:00
|
|
|
ryw_expected_values[k] = put_value;
|
|
|
|
|
2024-05-28 23:54:10 +00:00
|
|
|
const uint32_t value_base = put_value.GetValueBase();
|
|
|
|
|
2024-05-24 21:24:17 +00:00
|
|
|
char value[100];
|
2024-05-28 23:54:10 +00:00
|
|
|
const size_t sz = GenerateValue(value_base, value, sizeof(value));
|
2024-05-24 21:24:17 +00:00
|
|
|
const Slice v(value, sz);
|
|
|
|
|
2024-05-28 23:54:10 +00:00
|
|
|
if (op == Op::PutOrPutEntity) {
|
|
|
|
if (FLAGS_use_put_entity_one_in > 0 &&
|
|
|
|
(value_base % FLAGS_use_put_entity_one_in) == 0) {
|
|
|
|
s = txn->PutEntity(cfh, k, GenerateWideColumns(value_base, v));
|
|
|
|
} else {
|
|
|
|
s = txn->Put(cfh, k, v);
|
|
|
|
}
|
2024-05-24 21:24:17 +00:00
|
|
|
} else {
|
|
|
|
s = txn->Merge(cfh, k, v);
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case Op::Delete: {
|
|
|
|
ExpectedValue delete_value;
|
2024-05-28 23:54:10 +00:00
|
|
|
delete_value.SyncDelete();
|
2024-05-24 21:24:17 +00:00
|
|
|
ryw_expected_values[k] = delete_value;
|
|
|
|
|
|
|
|
s = txn->Delete(cfh, k);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
assert(false);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Transaction write error in read-your-own-write test: %s\n",
|
|
|
|
s.ToString().c_str());
|
|
|
|
shared->SafeTerminate();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-12-09 07:49:32 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
StressTest* CreateNonBatchedOpsStressTest() {
|
|
|
|
return new NonBatchedOpsStressTest();
|
|
|
|
}
|
|
|
|
|
2020-02-20 20:07:53 +00:00
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
2024-11-08 05:24:21 +00:00
|
|
|
#endif // GFLAGS
|