Fix a bug in iterator with UDT + `ReadOptions::pin_data` (#12451)

Summary:
with https://github.com/facebook/rocksdb/issues/12414 enabling `ReadOptions::pin_data`, this bug surfaced as corrupted per key-value checksum during crash test. `saved_key_.GetUserKey()` could be pinned user key, so DBIter should not overwrite it.

In one case, it only surfaces when iterator skips many keys of the same user key. To stress that code path, this PR also added `max_sequential_skip_in_iterations` to crash test.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/12451

Test Plan:
- Set ReadOptions::pin_data to true, the bug can be reproed quickly with `./db_stress --persist_user_defined_timestamps=1 --user_timestamp_size=8 --writepercent=35 --delpercent=4 --delrangepercent=1 --iterpercent=20 --nooverwritepercent=1 --prefix_size=8 --prefixpercent=10 --readpercent=30 --memtable_protection_bytes_per_key=8 --block_protection_bytes_per_key=2 --clear_column_family_one_in=0`.
    - Set max_sequential_skip_in_iterations to 1 for the other occurrence of the bug.

Reviewed By: jowlyzhang

Differential Revision: D55003766

Pulled By: cbi42

fbshipit-source-id: 23e1049129456684dafb028b6132b70e0afc07fb
This commit is contained in:
Changyu Bi 2024-03-18 09:05:11 -07:00 committed by Facebook GitHub Bot
parent f2546b6623
commit 3d5be596a5
5 changed files with 33 additions and 14 deletions

View File

@ -712,16 +712,21 @@ bool DBIter::ReverseToForward() {
// not exist or may have different prefix than the current key().
// If that's the case, seek iter_ to current key.
if (!expect_total_order_inner_iter() || !iter_.Valid()) {
IterKey last_key;
ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
kValueTypeForSeek);
if (timestamp_size_ > 0) {
std::string last_key;
if (timestamp_size_ == 0) {
AppendInternalKey(
&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
kMaxSequenceNumber, kValueTypeForSeek));
} else {
// TODO: pre-create kTsMax.
const std::string kTsMax(timestamp_size_, '\xff');
pikey.SetTimestamp(kTsMax);
AppendInternalKeyWithDifferentTimestamp(
&last_key,
ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber,
kValueTypeForSeek),
kTsMax);
}
last_key.SetInternalKey(pikey);
iter_.Seek(last_key.GetInternalKey());
iter_.Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
}
@ -1371,18 +1376,23 @@ bool DBIter::FindUserKeyBeforeSavedKey() {
if (num_skipped >= max_skip_) {
num_skipped = 0;
IterKey last_key;
ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
kValueTypeForSeek);
if (timestamp_size_ > 0) {
std::string last_key;
if (timestamp_size_ == 0) {
AppendInternalKey(&last_key, ParsedInternalKey(saved_key_.GetUserKey(),
kMaxSequenceNumber,
kValueTypeForSeek));
} else {
// TODO: pre-create kTsMax.
const std::string kTsMax(timestamp_size_, '\xff');
pikey.SetTimestamp(kTsMax);
AppendInternalKeyWithDifferentTimestamp(
&last_key,
ParsedInternalKey(saved_key_.GetUserKey(), kMaxSequenceNumber,
kValueTypeForSeek),
kTsMax);
}
last_key.SetInternalKey(pikey);
// It would be more efficient to use SeekForPrev() here, but some
// iterators may not support it.
iter_.Seek(last_key.GetInternalKey());
iter_.Seek(last_key);
RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
if (!iter_.Valid()) {
break;

View File

@ -391,6 +391,7 @@ DECLARE_double(high_pri_pool_ratio);
DECLARE_double(low_pri_pool_ratio);
DECLARE_uint64(soft_pending_compaction_bytes_limit);
DECLARE_uint64(hard_pending_compaction_bytes_limit);
DECLARE_uint64(max_sequential_skip_in_iterations);
constexpr long KB = 1024;
constexpr int kRandomValueMaxFactor = 3;

View File

@ -36,6 +36,11 @@ DEFINE_int64(max_key, 1 * KB * KB,
DEFINE_int32(max_key_len, 3, "Maximum length of a key in 8-byte units");
DEFINE_uint64(max_sequential_skip_in_iterations,
ROCKSDB_NAMESPACE::Options().max_sequential_skip_in_iterations,
"Iterator will reseek after scanning this number of keys with"
"the same user key during Next/Prev().");
DEFINE_string(key_len_percent_dist, "",
"Percentages of keys of various lengths. For example, 1,30,69 "
"means 1% of keys are 8 bytes, 30% are 16 bytes, and 69% are "

View File

@ -3649,6 +3649,8 @@ void InitializeOptionsFromFlags(
FLAGS_soft_pending_compaction_bytes_limit;
options.hard_pending_compaction_bytes_limit =
FLAGS_hard_pending_compaction_bytes_limit;
options.max_sequential_skip_in_iterations =
FLAGS_max_sequential_skip_in_iterations;
}
void InitializeOptionsGeneral(

View File

@ -96,6 +96,7 @@ default_params = {
"max_bytes_for_level_base": 10485760,
# max_key has to be the same across invocations for verification to work, hence no lambda
"max_key": random.choice([100000, 25000000]),
"max_sequential_skip_in_iterations": lambda: random.choice([1, 2, 8, 16]),
"max_write_buffer_number": 3,
"mmap_read": lambda: random.randint(0, 1),
# Setting `nooverwritepercent > 0` is only possible because we do not vary