rocksdb/db_stress_tool/no_batched_ops_stress.cc

//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/dbformat.h"
#include "db_stress_tool/db_stress_listener.h"
#include "db_stress_tool/db_stress_shared_state.h"
#include "db_stress_tool/expected_state.h"
#include "rocksdb/status.h"
#ifdef GFLAGS
#include "db/wide/wide_columns_helper.h"
#include "db_stress_tool/db_stress_common.h"
#include "rocksdb/utilities/transaction_db.h"
#include "utilities/fault_injection_fs.h"

namespace ROCKSDB_NAMESPACE {
class NonBatchedOpsStressTest : public StressTest {
 public:
  NonBatchedOpsStressTest() = default;

  virtual ~NonBatchedOpsStressTest() = default;

  void VerifyDb(ThreadState* thread) const override {
    // This `ReadOptions` is for validation purposes. Ignore
    // `FLAGS_rate_limit_user_ops` to avoid slowing any validation.
    ReadOptions options(FLAGS_verify_checksum, true);
    std::string ts_str;
    Slice ts;
    if (FLAGS_user_timestamp_size > 0) {
      ts_str = GetNowNanos();
      ts = ts_str;
      options.timestamp = &ts;
    }

    auto shared = thread->shared;
    const int64_t max_key = shared->GetMaxKey();
    const int64_t keys_per_thread = max_key / shared->GetNumThreads();
    int64_t start = keys_per_thread * thread->tid;
    int64_t end = start + keys_per_thread;
    uint64_t prefix_to_use =
        (FLAGS_prefix_size < 0) ? 1 : static_cast<size_t>(FLAGS_prefix_size);

    if (thread->tid == shared->GetNumThreads() - 1) {
      end = max_key;
    }

    for (size_t cf = 0; cf < column_families_.size(); ++cf) {
      if (thread->shared->HasVerificationFailedYet()) {
        break;
      }

      enum class VerificationMethod {
        kIterator,
        kGet,
        kGetEntity,
        kMultiGet,
        kMultiGetEntity,
        kGetMergeOperands,
        // Add any new items above kNumberOfMethods
        kNumberOfMethods
      };

      constexpr int num_methods =
          static_cast<int>(VerificationMethod::kNumberOfMethods);

      const VerificationMethod method =
          static_cast<VerificationMethod>(thread->rand.Uniform(
              (FLAGS_user_timestamp_size > 0) ? num_methods - 1 : num_methods));

      if (method == VerificationMethod::kIterator) {
        std::unique_ptr<Iterator> iter(
            db_->NewIterator(options, column_families_[cf]));

        std::string seek_key = Key(start);
        iter->Seek(seek_key);

        Slice prefix(seek_key.data(), prefix_to_use);

        for (int64_t i = start; i < end; ++i) {
          if (thread->shared->HasVerificationFailedYet()) {
            break;
          }

          const std::string key = Key(i);
          const Slice k(key);
          const Slice pfx(key.data(), prefix_to_use);

          // Reseek when the prefix changes
          if (prefix_to_use > 0 && prefix.compare(pfx) != 0) {
            iter->Seek(k);
            seek_key = key;
            prefix = Slice(seek_key.data(), prefix_to_use);
          }

          Status s = iter->status();

          std::string from_db;

          if (iter->Valid()) {
            const int diff = iter->key().compare(k);

            if (diff > 0) {
              s = Status::NotFound();
            } else if (diff == 0) {
              if (!VerifyWideColumns(iter->value(), iter->columns())) {
                VerificationAbort(shared, static_cast<int>(cf), i,
                                  iter->value(), iter->columns());
              }

              from_db = iter->value().ToString();
              iter->Next();
            } else {
              assert(diff < 0);

              VerificationAbort(shared, "An out of range key was found",
                                static_cast<int>(cf), i);
            }
          } else {
            // The iterator found no value for the key in question, so do not
            // move to the next item in the iterator
            s = Status::NotFound();
          }

          VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
                            /* msg_prefix */ "Iterator verification", s);

          if (!from_db.empty()) {
            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
                          from_db.data(), from_db.size());
          }
        }
      } else if (method == VerificationMethod::kGet) {
        for (int64_t i = start; i < end; ++i) {
          if (thread->shared->HasVerificationFailedYet()) {
            break;
          }

          const std::string key = Key(i);
          std::string from_db;

          Status s = db_->Get(options, column_families_[cf], key, &from_db);

          VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
                            /* msg_prefix */ "Get verification", s);

          if (!from_db.empty()) {
            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
                          from_db.data(), from_db.size());
          }
        }
      } else if (method == VerificationMethod::kGetEntity) {
        for (int64_t i = start; i < end; ++i) {
          if (thread->shared->HasVerificationFailedYet()) {
            break;
          }

          const std::string key = Key(i);
          PinnableWideColumns result;

          Status s =
              db_->GetEntity(options, column_families_[cf], key, &result);

          std::string from_db;

          if (s.ok()) {
            const WideColumns& columns = result.columns();

            if (WideColumnsHelper::HasDefaultColumn(columns)) {
              from_db = WideColumnsHelper::GetDefaultColumn(columns).ToString();
            }

            if (!VerifyWideColumns(columns)) {
              VerificationAbort(shared, static_cast<int>(cf), i, from_db,
                                columns);
            }
          }

          VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
                            /* msg_prefix */ "GetEntity verification", s);

          if (!from_db.empty()) {
            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
                          from_db.data(), from_db.size());
          }
        }
      } else if (method == VerificationMethod::kMultiGet) {
        for (int64_t i = start; i < end;) {
          if (thread->shared->HasVerificationFailedYet()) {
            break;
          }

          // Keep the batch size to some reasonable value
          size_t batch_size = thread->rand.Uniform(128) + 1;
          batch_size = std::min<size_t>(batch_size, end - i);

          std::vector<std::string> key_strs(batch_size);
          std::vector<Slice> keys(batch_size);
          std::vector<PinnableSlice> values(batch_size);
          std::vector<Status> statuses(batch_size);

          for (size_t j = 0; j < batch_size; ++j) {
            key_strs[j] = Key(i + j);
            keys[j] = Slice(key_strs[j]);
          }

          db_->MultiGet(options, column_families_[cf], batch_size, keys.data(),
                        values.data(), statuses.data());

          for (size_t j = 0; j < batch_size; ++j) {
            const std::string from_db = values[j].ToString();

            VerifyOrSyncValue(static_cast<int>(cf), i + j, options, shared,
                              from_db, /* msg_prefix */ "MultiGet verification",
                              statuses[j]);

            if (!from_db.empty()) {
              PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i + j),
                            from_db.data(), from_db.size());
            }
          }

          i += batch_size;
        }
      } else if (method == VerificationMethod::kMultiGetEntity) {
        for (int64_t i = start; i < end;) {
          if (thread->shared->HasVerificationFailedYet()) {
            break;
          }

          // Keep the batch size to some reasonable value
          size_t batch_size = thread->rand.Uniform(128) + 1;
          batch_size = std::min<size_t>(batch_size, end - i);

          std::vector<std::string> key_strs(batch_size);
          std::vector<Slice> keys(batch_size);
          std::vector<PinnableWideColumns> results(batch_size);
          std::vector<Status> statuses(batch_size);

          for (size_t j = 0; j < batch_size; ++j) {
            key_strs[j] = Key(i + j);
            keys[j] = Slice(key_strs[j]);
          }

          db_->MultiGetEntity(options, column_families_[cf], batch_size,
                              keys.data(), results.data(), statuses.data());

          for (size_t j = 0; j < batch_size; ++j) {
            std::string from_db;

            if (statuses[j].ok()) {
              const WideColumns& columns = results[j].columns();

              if (WideColumnsHelper::HasDefaultColumn(columns)) {
                from_db =
                    WideColumnsHelper::GetDefaultColumn(columns).ToString();
              }

              if (!VerifyWideColumns(columns)) {
                VerificationAbort(shared, static_cast<int>(cf), i, from_db,
                                  columns);
              }
            }

            VerifyOrSyncValue(
                static_cast<int>(cf), i + j, options, shared, from_db,
                /* msg_prefix */ "MultiGetEntity verification", statuses[j]);

            if (!from_db.empty()) {
              PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i + j),
                            from_db.data(), from_db.size());
            }
          }

          i += batch_size;
        }
      } else {
        assert(method == VerificationMethod::kGetMergeOperands);

        // Start off with small size that will be increased later if necessary
        std::vector<PinnableSlice> values(4);

        GetMergeOperandsOptions merge_operands_info;
        merge_operands_info.expected_max_number_of_operands =
            static_cast<int>(values.size());

        for (int64_t i = start; i < end; ++i) {
          if (thread->shared->HasVerificationFailedYet()) {
            break;
          }

          const std::string key = Key(i);
          const Slice k(key);
          std::string from_db;
          int number_of_operands = 0;

          Status s = db_->GetMergeOperands(options, column_families_[cf], k,
                                           values.data(), &merge_operands_info,
                                           &number_of_operands);

          if (s.IsIncomplete()) {
            // Need to resize values as there are more than values.size() merge
            // operands on this key. Should only happen a few times when we
            // encounter a key that had more merge operands than any key seen so
            // far
            values.resize(number_of_operands);
            merge_operands_info.expected_max_number_of_operands =
                static_cast<int>(number_of_operands);
            s = db_->GetMergeOperands(options, column_families_[cf], k,
                                      values.data(), &merge_operands_info,
                                      &number_of_operands);
          }
          // Assumed here that GetMergeOperands always sets number_of_operand
          if (number_of_operands) {
            from_db = values[number_of_operands - 1].ToString();
          }

          VerifyOrSyncValue(static_cast<int>(cf), i, options, shared, from_db,
                            /* msg_prefix */ "GetMergeOperands verification",
                            s);

          if (!from_db.empty()) {
            PrintKeyValue(static_cast<int>(cf), static_cast<uint32_t>(i),
                          from_db.data(), from_db.size());
          }
        }
      }
    }
  }

  void ContinuouslyVerifyDb(ThreadState* thread) const override {
    if (!cmp_db_) {
      return;
    }
    assert(cmp_db_);
    assert(!cmp_cfhs_.empty());
    Status s = cmp_db_->TryCatchUpWithPrimary();
    if (!s.ok()) {
      assert(false);
      exit(1);
    }

    const auto checksum_column_family = [](Iterator* iter,
                                           uint32_t* checksum) -> Status {
      assert(nullptr != checksum);
      uint32_t ret = 0;
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        ret = crc32c::Extend(ret, iter->key().data(), iter->key().size());
        ret = crc32c::Extend(ret, iter->value().data(), iter->value().size());
      }
      *checksum = ret;
      return iter->status();
    };

    auto* shared = thread->shared;
    assert(shared);
    const int64_t max_key = shared->GetMaxKey();
    ReadOptions read_opts(FLAGS_verify_checksum, true);
    std::string ts_str;
    Slice ts;
    if (FLAGS_user_timestamp_size > 0) {
      ts_str = GetNowNanos();
      ts = ts_str;
      read_opts.timestamp = &ts;
    }

    static Random64 rand64(shared->GetSeed());

    {
      uint32_t crc = 0;
      std::unique_ptr<Iterator> it(cmp_db_->NewIterator(read_opts));
      s = checksum_column_family(it.get(), &crc);
      if (!s.ok()) {
        fprintf(stderr, "Computing checksum of default cf: %s\n",
                s.ToString().c_str());
        assert(false);
      }
    }

    for (auto* handle : cmp_cfhs_) {
      if (thread->rand.OneInOpt(3)) {
        // Use Get()
        uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
        std::string key_str = Key(key);
        std::string value;
        std::string key_ts;
        s = cmp_db_->Get(read_opts, handle, key_str, &value,
                         FLAGS_user_timestamp_size > 0 ? &key_ts : nullptr);
        s.PermitUncheckedError();
      } else {
        // Use range scan
        std::unique_ptr<Iterator> iter(cmp_db_->NewIterator(read_opts, handle));
        uint32_t rnd = (thread->rand.Next()) % 4;
        if (0 == rnd) {
          // SeekToFirst() + Next()*5
          read_opts.total_order_seek = true;
          iter->SeekToFirst();
          for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Next()) {
          }
        } else if (1 == rnd) {
          // SeekToLast() + Prev()*5
          read_opts.total_order_seek = true;
          iter->SeekToLast();
          for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Prev()) {
          }
        } else if (2 == rnd) {
          // Seek() +Next()*5
          uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
          std::string key_str = Key(key);
          iter->Seek(key_str);
          for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Next()) {
          }
        } else {
          // SeekForPrev() + Prev()*5
          uint64_t key = rand64.Uniform(static_cast<uint64_t>(max_key));
          std::string key_str = Key(key);
          iter->SeekForPrev(key_str);
          for (int i = 0; i < 5 && iter->Valid(); ++i, iter->Prev()) {
          }
        }
      }
    }
  }

  void MaybeClearOneColumnFamily(ThreadState* thread) override {
    if (FLAGS_column_families > 1) {
      if (thread->rand.OneInOpt(FLAGS_clear_column_family_one_in)) {
        // drop column family and then create it again (can't drop default)
        int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1;
        std::string new_name =
            std::to_string(new_column_family_name_.fetch_add(1));
        {
          MutexLock l(thread->shared->GetMutex());
          fprintf(
              stdout,
              "[CF %d] Dropping and recreating column family. new name: %s\n",
              cf, new_name.c_str());
        }
        thread->shared->LockColumnFamily(cf);
        Status s = db_->DropColumnFamily(column_families_[cf]);
        delete column_families_[cf];
        if (!s.ok()) {
          fprintf(stderr, "dropping column family error: %s\n",
                  s.ToString().c_str());
          thread->shared->SafeTerminate();
        }
        s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name,
                                    &column_families_[cf]);
        column_family_names_[cf] = new_name;
        thread->shared->ClearColumnFamily(cf);
        if (!s.ok()) {
          fprintf(stderr, "creating column family error: %s\n",
                  s.ToString().c_str());
          thread->shared->SafeTerminate();
        }
        thread->shared->UnlockColumnFamily(cf);
      }
    }
  }

  bool ShouldAcquireMutexOnKey() const override { return true; }

  bool IsStateTracked() const override { return true; }

  void TestKeyMayExist(ThreadState* thread, const ReadOptions& read_opts,
                       const std::vector<int>& rand_column_families,
                       const std::vector<int64_t>& rand_keys) override {
    auto cfh = column_families_[rand_column_families[0]];
    std::string key_str = Key(rand_keys[0]);
    Slice key = key_str;
    std::string ignore;
    ReadOptions read_opts_copy = read_opts;

    std::string read_ts_str;
    Slice read_ts_slice;
    if (FLAGS_user_timestamp_size > 0) {
      read_ts_str = GetNowNanos();
      read_ts_slice = read_ts_str;
      read_opts_copy.timestamp = &read_ts_slice;
    }
    bool read_older_ts = MaybeUseOlderTimestampForPointLookup(
        thread, read_ts_str, read_ts_slice, read_opts_copy);

    const ExpectedValue pre_read_expected_value =
        thread->shared->Get(rand_column_families[0], rand_keys[0]);
    bool key_may_exist = db_->KeyMayExist(read_opts_copy, cfh, key, &ignore);
    const ExpectedValue post_read_expected_value =
        thread->shared->Get(rand_column_families[0], rand_keys[0]);

    if (!key_may_exist && !FLAGS_skip_verifydb && !read_older_ts) {
      if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value,
                                               post_read_expected_value)) {
        thread->shared->SetVerificationFailure();
        fprintf(stderr,
                "error : inconsistent values for key %s: expected state has "
                "the key, TestKeyMayExist() returns false indicating the key "
                "must not exist.\n",
                key.ToString(true).c_str());
      }
    }
  }

  Status TestGet(ThreadState* thread, const ReadOptions& read_opts,
                 const std::vector<int>& rand_column_families,
                 const std::vector<int64_t>& rand_keys) override {
    auto cfh = column_families_[rand_column_families[0]];
    std::string key_str = Key(rand_keys[0]);
    Slice key = key_str;
    std::string from_db;

    ReadOptions read_opts_copy = read_opts;
    std::string read_ts_str;
    Slice read_ts_slice;
    if (FLAGS_user_timestamp_size > 0) {
      read_ts_str = GetNowNanos();
      read_ts_slice = read_ts_str;
      read_opts_copy.timestamp = &read_ts_slice;
    }
    bool read_older_ts = MaybeUseOlderTimestampForPointLookup(
        thread, read_ts_str, read_ts_slice, read_opts_copy);

    if (fault_fs_guard) {
      fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
          FaultInjectionIOType::kRead);
      fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
          FaultInjectionIOType::kMetadataRead);
      SharedState::ignore_read_error = false;
    }

    const ExpectedValue pre_read_expected_value =
        thread->shared->Get(rand_column_families[0], rand_keys[0]);
    Status s = db_->Get(read_opts_copy, cfh, key, &from_db);
    const ExpectedValue post_read_expected_value =
        thread->shared->Get(rand_column_families[0], rand_keys[0]);

    int injected_error_count = 0;
    if (fault_fs_guard) {
      injected_error_count = GetMinInjectedErrorCount(
          fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
              FaultInjectionIOType::kRead),
          fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
              FaultInjectionIOType::kMetadataRead));
      if (!SharedState::ignore_read_error && injected_error_count > 0 &&
          (s.ok() || s.IsNotFound())) {
        // Grab mutex so multiple thread don't try to print the
        // stack trace at the same time
        MutexLock l(thread->shared->GetMutex());
        fprintf(stderr, "Didn't get expected error from Get\n");
        fprintf(stderr, "Callstack that injected the fault\n");
        fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
            FaultInjectionIOType::kRead);
        fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
            FaultInjectionIOType::kMetadataRead);
        std::terminate();
      }
    }

    if (s.ok()) {
      // found case
      thread->stats.AddGets(1, 1);
      // we only have the latest expected state
      if (!FLAGS_skip_verifydb && !read_older_ts) {
        if (ExpectedValueHelper::MustHaveNotExisted(pre_read_expected_value,
                                                    post_read_expected_value)) {
          thread->shared->SetVerificationFailure();
          fprintf(stderr,
                  "error : inconsistent values for key %s (%" PRIi64
                  "): Get returns %s, "
                  "but expected state is \"deleted\".\n",
                  key.ToString(true).c_str(), rand_keys[0],
                  StringToHex(from_db).c_str());
        }
        Slice from_db_slice(from_db);
        uint32_t value_base_from_db = GetValueBase(from_db_slice);
        if (!ExpectedValueHelper::InExpectedValueBaseRange(
                value_base_from_db, pre_read_expected_value,
                post_read_expected_value)) {
          thread->shared->SetVerificationFailure();
          fprintf(stderr,
                  "error : inconsistent values for key %s (%" PRIi64
                  "): Get returns %s with "
                  "value base %d that falls out of expected state's value base "
                  "range.\n",
                  key.ToString(true).c_str(), rand_keys[0],
                  StringToHex(from_db).c_str(), value_base_from_db);
        }
      }
    } else if (s.IsNotFound()) {
      // not found case
      thread->stats.AddGets(1, 0);
      if (!FLAGS_skip_verifydb && !read_older_ts) {
        if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value,
                                                 post_read_expected_value)) {
          thread->shared->SetVerificationFailure();
          fprintf(stderr,
                  "error : inconsistent values for key %s (%" PRIi64
                  "): expected state has "
                  "the key, Get() returns NotFound.\n",
                  key.ToString(true).c_str(), rand_keys[0]);
        }
      }
    } else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
      thread->shared->SetVerificationFailure();
      fprintf(stderr, "error : Get() returns %s for key: %s (%" PRIi64 ").\n",
              s.ToString().c_str(), key.ToString(true).c_str(), rand_keys[0]);
    }
    return s;
  }

  std::vector<Status> TestMultiGet(
      ThreadState* thread, const ReadOptions& read_opts,
      const std::vector<int>& rand_column_families,
      const std::vector<int64_t>& rand_keys) override {
    size_t num_keys = rand_keys.size();
    std::vector<std::string> key_str;
    std::vector<Slice> keys;
    key_str.reserve(num_keys);
    keys.reserve(num_keys);
    std::vector<PinnableSlice> values(num_keys);
    std::vector<Status> statuses(num_keys);
    // When Flags_use_txn is enabled, we also do a read your write check.
    std::unordered_map<std::string, ExpectedValue> ryw_expected_values;

    SharedState* shared = thread->shared;
    assert(shared);

    int column_family = rand_column_families[0];
    ColumnFamilyHandle* cfh = column_families_[column_family];

    bool do_consistency_check = FLAGS_check_multiget_consistency;

    ReadOptions readoptionscopy = read_opts;

    if (do_consistency_check) {
      readoptionscopy.snapshot = db_->GetSnapshot();
    }

    std::string read_ts_str;
    Slice read_ts_slice;
    MaybeUseOlderTimestampForPointLookup(thread, read_ts_str, read_ts_slice,
                                         readoptionscopy);

    readoptionscopy.rate_limiter_priority =
        FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL;

    // To appease clang analyzer
    const bool use_txn = FLAGS_use_txn;

    // Create a transaction in order to write some data. The purpose is to
    // exercise WriteBatchWithIndex::MultiGetFromBatchAndDB. The transaction
    // will be rolled back once MultiGet returns.
    std::unique_ptr<Transaction> txn;
    if (use_txn) {
      // TODO(hx235): test fault injection with MultiGet() with transactions
      if (fault_fs_guard) {
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }
      WriteOptions wo;
      if (FLAGS_rate_limit_auto_wal_flush) {
        wo.rate_limiter_priority = Env::IO_USER;
      }
      Status s = NewTxn(wo, &txn);
      if (!s.ok()) {
        fprintf(stderr, "NewTxn error: %s\n", s.ToString().c_str());
        shared->SafeTerminate();
      }
    }
    for (size_t i = 0; i < num_keys; ++i) {
      uint64_t rand_key = rand_keys[i];
      key_str.emplace_back(Key(rand_key));
      keys.emplace_back(key_str.back());
      if (use_txn) {
        MaybeAddKeyToTxnForRYW(thread, column_family, rand_key, txn.get(),
                               ryw_expected_values);
      }
    }

    int injected_error_count = 0;

    if (!use_txn) {
      if (fault_fs_guard) {
        fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
            FaultInjectionIOType::kRead);
        fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
            FaultInjectionIOType::kMetadataRead);
        SharedState::ignore_read_error = false;
      }
      db_->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
                    statuses.data());
      if (fault_fs_guard) {
        injected_error_count = GetMinInjectedErrorCount(
            fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
                FaultInjectionIOType::kRead),
            fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
                FaultInjectionIOType::kMetadataRead));

        if (injected_error_count > 0) {
          int stat_nok_nfound = 0;
          for (const auto& s : statuses) {
            if (!s.ok() && !s.IsNotFound()) {
              stat_nok_nfound++;
            }
          }
          if (!SharedState::ignore_read_error &&
              stat_nok_nfound < injected_error_count) {
            // Grab mutex so multiple thread don't try to print the
            // stack trace at the same time
            MutexLock l(shared->GetMutex());
            fprintf(stderr, "Didn't get expected error from MultiGet. \n");
            fprintf(stderr,
                    "num_keys %zu Expected %d errors, seen at least %d\n",
                    num_keys, injected_error_count, stat_nok_nfound);
            fprintf(stderr, "Callstack that injected the fault\n");
            fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
                FaultInjectionIOType::kRead);
            fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
                FaultInjectionIOType::kMetadataRead);
            std::terminate();
          }
        }
      }
    } else {
      assert(txn);
      txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(),
                    statuses.data());
    }

    auto ryw_check =
        [](const Slice& key, const PinnableSlice& value, const Status& s,
           const std::optional<ExpectedValue>& ryw_expected_value) -> bool {
      if (!ryw_expected_value.has_value()) {
        return true;
      }
      const ExpectedValue& expected = ryw_expected_value.value();
      char expected_value[100];
      if (s.ok() &&
          ExpectedValueHelper::MustHaveNotExisted(expected, expected)) {
        fprintf(stderr,
                "MultiGet returned value different from what was "
                "written for key %s\n",
                key.ToString(true).c_str());
        fprintf(stderr,
                "MultiGet returned ok, transaction has non-committed "
                "delete.\n");
        return false;
      } else if (s.IsNotFound() &&
                 ExpectedValueHelper::MustHaveExisted(expected, expected)) {
        fprintf(stderr,
                "MultiGet returned value different from what was "
                "written for key %s\n",
                key.ToString(true).c_str());
        fprintf(stderr,
                "MultiGet returned not found, transaction has "
                "non-committed value.\n");
        return false;
      } else if (s.ok() &&
                 ExpectedValueHelper::MustHaveExisted(expected, expected)) {
        Slice from_txn_slice(value);
        size_t sz = GenerateValue(expected.GetValueBase(), expected_value,
                                  sizeof(expected_value));
        Slice expected_value_slice(expected_value, sz);
        if (expected_value_slice.compare(from_txn_slice) == 0) {
          return true;
        }
        fprintf(stderr,
                "MultiGet returned value different from what was "
                "written for key %s\n",
                key.ToString(true /* hex */).c_str());
        fprintf(stderr, "MultiGet returned value %s\n",
                from_txn_slice.ToString(true /* hex */).c_str());
        fprintf(stderr, "Transaction has non-committed value %s\n",
                expected_value_slice.ToString(true /* hex */).c_str());
        return false;
      }
      return true;
    };

    auto check_multiget =
        [&](const Slice& key, const PinnableSlice& expected_value,
            const Status& s,
            const std::optional<ExpectedValue>& ryw_expected_value) -> bool {
      //  Temporarily disable error injection for verification
      if (fault_fs_guard) {
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }

      bool check_multiget_res = true;
      bool is_consistent = true;
      bool is_ryw_correct = true;

      // If test does not use transaction, the consistency check for each key
      // included check results from db `Get` and db `MultiGet` are consistent.
      // If test use transaction, after consistency check, also do a read your
      // own write check.
      Status tmp_s;
      std::string value;

      if (use_txn) {
        assert(txn);
        ThreadStatusUtil::SetThreadOperation(
            ThreadStatus::OperationType::OP_GET);
        tmp_s = txn->Get(readoptionscopy, cfh, key, &value);
        ThreadStatusUtil::SetThreadOperation(
            ThreadStatus::OperationType::OP_MULTIGET);
      } else {
        ThreadStatusUtil::SetThreadOperation(
            ThreadStatus::OperationType::OP_GET);
        tmp_s = db_->Get(readoptionscopy, cfh, key, &value);
        ThreadStatusUtil::SetThreadOperation(
            ThreadStatus::OperationType::OP_MULTIGET);
      }
      if (!tmp_s.ok() && !tmp_s.IsNotFound()) {
        fprintf(stderr, "Get error: %s\n", s.ToString().c_str());
        is_consistent = false;
      } else if (!s.ok() && tmp_s.ok()) {
        fprintf(stderr,
                "MultiGet(%d) returned different results with key %s. "
                "Snapshot Seq No: %" PRIu64 "\n",
                column_family, key.ToString(true).c_str(),
                readoptionscopy.snapshot->GetSequenceNumber());
        fprintf(stderr, "Get returned ok, MultiGet returned not found\n");
        is_consistent = false;
      } else if (s.ok() && tmp_s.IsNotFound()) {
        fprintf(stderr,
                "MultiGet(%d) returned different results with key %s. "
                "Snapshot Seq No: %" PRIu64 "\n",
                column_family, key.ToString(true).c_str(),
                readoptionscopy.snapshot->GetSequenceNumber());
        fprintf(stderr, "MultiGet returned ok, Get returned not found\n");
        is_consistent = false;
      } else if (s.ok() && value != expected_value.ToString()) {
        fprintf(stderr,
                "MultiGet(%d) returned different results with key %s. "
                "Snapshot Seq No: %" PRIu64 "\n",
                column_family, key.ToString(true).c_str(),
                readoptionscopy.snapshot->GetSequenceNumber());
        fprintf(stderr, "MultiGet returned value %s\n",
                expected_value.ToString(true).c_str());
        fprintf(stderr, "Get returned value %s\n",
                Slice(value).ToString(true /* hex */).c_str());
        is_consistent = false;
      }

      // If test uses transaction, continue to do a read your own write check.
      if (is_consistent && use_txn) {
        is_ryw_correct = ryw_check(key, expected_value, s, ryw_expected_value);
      }

      if (!is_consistent) {
        fprintf(stderr, "TestMultiGet error: is_consistent is false\n");
        thread->stats.AddErrors(1);
        check_multiget_res = false;
        // Fail fast to preserve the DB state
        shared->SetVerificationFailure();
      } else if (!is_ryw_correct) {
        fprintf(stderr, "TestMultiGet error: is_ryw_correct is false\n");
        thread->stats.AddErrors(1);
        check_multiget_res = false;
        // Fail fast to preserve the DB state
        shared->SetVerificationFailure();
      } else if (s.ok()) {
        // found case
        thread->stats.AddGets(1, 1);
      } else if (s.IsNotFound()) {
        // not found case
        thread->stats.AddGets(1, 0);
      } else if (s.IsMergeInProgress() && use_txn) {
        // With txn this is sometimes expected.
        thread->stats.AddGets(1, 1);
      } else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
        fprintf(stderr, "MultiGet error: %s\n", s.ToString().c_str());
        thread->stats.AddErrors(1);
        shared->SetVerificationFailure();
      }

      // Enable back error injection disbled for checking results
      if (fault_fs_guard) {
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }
      return check_multiget_res;
    };

    // Consistency check
    if (do_consistency_check && injected_error_count == 0) {
      size_t num_of_keys = keys.size();
      assert(values.size() == num_of_keys);
      assert(statuses.size() == num_of_keys);
      for (size_t i = 0; i < num_of_keys; ++i) {
        bool check_result = true;
        if (use_txn) {
          std::optional<ExpectedValue> ryw_expected_value;

          const auto it = ryw_expected_values.find(key_str[i]);
          if (it != ryw_expected_values.end()) {
            ryw_expected_value = it->second;
          }

          check_result = check_multiget(keys[i], values[i], statuses[i],
                                        ryw_expected_value);
        } else {
          check_result = check_multiget(keys[i], values[i], statuses[i],
                                        std::nullopt /* ryw_expected_value */);
        }
        if (!check_result) {
          break;
        }
      }
    }

    if (readoptionscopy.snapshot) {
      db_->ReleaseSnapshot(readoptionscopy.snapshot);
    }
    if (use_txn) {
      txn->Rollback().PermitUncheckedError();
      // Enable back error injection disbled for transactions
      if (fault_fs_guard) {
        fault_fs_guard->EnableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->EnableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }
    }
    return statuses;
  }

  void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts,
                     const std::vector<int>& rand_column_families,
                     const std::vector<int64_t>& rand_keys) override {
    assert(thread);

    SharedState* const shared = thread->shared;
    assert(shared);

    assert(!rand_column_families.empty());

    const int column_family = rand_column_families[0];

    assert(column_family >= 0);
    assert(column_family < static_cast<int>(column_families_.size()));

    ColumnFamilyHandle* const cfh = column_families_[column_family];
    assert(cfh);

    assert(!rand_keys.empty());

    const int64_t key = rand_keys[0];
    const std::string key_str = Key(key);

    PinnableWideColumns columns_from_db;
    PinnableAttributeGroups attribute_groups_from_db;

    ReadOptions read_opts_copy = read_opts;
    std::string read_ts_str;
    Slice read_ts_slice;
    if (FLAGS_user_timestamp_size > 0) {
      read_ts_str = GetNowNanos();
      read_ts_slice = read_ts_str;
      read_opts_copy.timestamp = &read_ts_slice;
    }
    const bool read_older_ts = MaybeUseOlderTimestampForPointLookup(
        thread, read_ts_str, read_ts_slice, read_opts_copy);

    const ExpectedValue pre_read_expected_value =
        thread->shared->Get(column_family, key);

    if (fault_fs_guard) {
      fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
          FaultInjectionIOType::kRead);
      fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
          FaultInjectionIOType::kMetadataRead);
      SharedState::ignore_read_error = false;
    }

    Status s;
    if (FLAGS_use_attribute_group) {
      attribute_groups_from_db.emplace_back(cfh);
      s = db_->GetEntity(read_opts_copy, key_str, &attribute_groups_from_db);
      if (s.ok()) {
        s = attribute_groups_from_db.back().status();
      }
    } else {
      s = db_->GetEntity(read_opts_copy, cfh, key_str, &columns_from_db);
    }

    const ExpectedValue post_read_expected_value =
        thread->shared->Get(column_family, key);

    int injected_error_count = 0;
    if (fault_fs_guard) {
      injected_error_count = GetMinInjectedErrorCount(
          fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
              FaultInjectionIOType::kRead),
          fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
              FaultInjectionIOType::kMetadataRead));
      if (!SharedState::ignore_read_error && injected_error_count > 0 &&
          (s.ok() || s.IsNotFound())) {
        // Grab mutex so multiple thread don't try to print the
        // stack trace at the same time
        MutexLock l(thread->shared->GetMutex());
        fprintf(stderr, "Didn't get expected error from GetEntity\n");
        fprintf(stderr, "Callstack that injected the fault\n");
        fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
            FaultInjectionIOType::kRead);
        fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
            FaultInjectionIOType::kMetadataRead);
        std::terminate();
      }
    }

    if (s.ok()) {
      thread->stats.AddGets(1, 1);

      if (!FLAGS_skip_verifydb && !read_older_ts) {
        if (FLAGS_use_attribute_group) {
          assert(!attribute_groups_from_db.empty());
        }
        const WideColumns& columns =
            FLAGS_use_attribute_group
                ? attribute_groups_from_db.back().columns()
                : columns_from_db.columns();
        if (!VerifyWideColumns(columns)) {
          shared->SetVerificationFailure();
          fprintf(stderr,
                  "error : inconsistent columns returned by GetEntity for key "
                  "%s (%" PRIi64 "): %s\n",
                  StringToHex(key_str).c_str(), rand_keys[0],
                  WideColumnsToHex(columns).c_str());
        } else if (ExpectedValueHelper::MustHaveNotExisted(
                       pre_read_expected_value, post_read_expected_value)) {
          shared->SetVerificationFailure();
          fprintf(stderr,
                  "error : inconsistent values for key %s (%" PRIi64
                  "): GetEntity returns %s, "
                  "expected state does not have the key.\n",
                  StringToHex(key_str).c_str(), rand_keys[0],
                  WideColumnsToHex(columns).c_str());
        } else {
          const uint32_t value_base_from_db =
              GetValueBase(WideColumnsHelper::GetDefaultColumn(columns));
          if (!ExpectedValueHelper::InExpectedValueBaseRange(
                  value_base_from_db, pre_read_expected_value,
                  post_read_expected_value)) {
            shared->SetVerificationFailure();
            fprintf(
                stderr,
                "error : inconsistent values for key %s (%" PRIi64
                "): GetEntity returns %s "
                "with value base %d that falls out of expected state's value "
                "base range.\n",
                StringToHex(key_str).c_str(), rand_keys[0],
                WideColumnsToHex(columns).c_str(), value_base_from_db);
          }
        }
      }
    } else if (s.IsNotFound()) {
      thread->stats.AddGets(1, 0);

      if (!FLAGS_skip_verifydb && !read_older_ts) {
        if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value,
                                                 post_read_expected_value)) {
          shared->SetVerificationFailure();
          fprintf(stderr,
                  "error : inconsistent values for key %s (%" PRIi64
                  "): expected state has "
                  "the key, GetEntity returns NotFound.\n",
                  StringToHex(key_str).c_str(), rand_keys[0]);
        }
      }
    } else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
      fprintf(stderr,
              "error : GetEntity() returns %s for key: %s (%" PRIi64 ").\n",
              s.ToString().c_str(), StringToHex(key_str).c_str(), rand_keys[0]);
      thread->shared->SetVerificationFailure();
    }
  }

  void TestMultiGetEntity(ThreadState* thread, const ReadOptions& read_opts,
                          const std::vector<int>& rand_column_families,
                          const std::vector<int64_t>& rand_keys) override {
    assert(thread);

    ManagedSnapshot snapshot_guard(db_);

    ReadOptions read_opts_copy(read_opts);
    read_opts_copy.snapshot = snapshot_guard.snapshot();

    assert(!rand_column_families.empty());

    const int column_family = rand_column_families[0];

    assert(column_family >= 0);
    assert(column_family < static_cast<int>(column_families_.size()));

    ColumnFamilyHandle* const cfh = column_families_[column_family];
    assert(cfh);

    assert(!rand_keys.empty());

    const size_t num_keys = rand_keys.size();

    std::unique_ptr<Transaction> txn;

    if (FLAGS_use_txn) {
      // TODO(hx235): test fault injection with MultiGetEntity() with
      // transactions
      if (fault_fs_guard) {
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }
      WriteOptions write_options;
      if (FLAGS_rate_limit_auto_wal_flush) {
        write_options.rate_limiter_priority = Env::IO_USER;
      }

      const Status s = NewTxn(write_options, &txn);
      if (!s.ok()) {
        fprintf(stderr, "NewTxn error: %s\n", s.ToString().c_str());
        thread->shared->SafeTerminate();
      }
    }

    std::vector<std::string> keys(num_keys);
    std::vector<Slice> key_slices(num_keys);
    std::unordered_map<std::string, ExpectedValue> ryw_expected_values;

    for (size_t i = 0; i < num_keys; ++i) {
      const int64_t key = rand_keys[i];

      keys[i] = Key(key);
      key_slices[i] = keys[i];

      if (FLAGS_use_txn) {
        MaybeAddKeyToTxnForRYW(thread, column_family, key, txn.get(),
                               ryw_expected_values);
      }
    }

    int injected_error_count = 0;

    auto verify_expected_errors = [&](auto get_status) {
      assert(fault_fs_guard);
      injected_error_count = GetMinInjectedErrorCount(
          fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
              FaultInjectionIOType::kRead),
          fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
              FaultInjectionIOType::kMetadataRead));
      if (injected_error_count) {
        int stat_nok_nfound = 0;
        for (size_t i = 0; i < num_keys; ++i) {
          const Status& s = get_status(i);
          if (!s.ok() && !s.IsNotFound()) {
            ++stat_nok_nfound;
          }
        }

        if (!SharedState::ignore_read_error &&
            stat_nok_nfound < injected_error_count) {
          // Grab mutex so multiple threads don't try to print the
          // stack trace at the same time
          assert(thread->shared);
          MutexLock l(thread->shared->GetMutex());

          fprintf(stderr, "Didn't get expected error from MultiGetEntity\n");
          fprintf(stderr, "num_keys %zu Expected %d errors, seen %d\n",
                  num_keys, injected_error_count, stat_nok_nfound);
          fprintf(stderr, "Call stack that injected the fault\n");
          fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
              FaultInjectionIOType::kRead);
          fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
              FaultInjectionIOType::kMetadataRead);
          std::terminate();
        }
      }
    };

    auto check_results = [&](auto get_columns, auto get_status,
                             auto do_extra_check, auto call_get_entity) {
      // Temporarily disable error injection for checking results
      if (fault_fs_guard) {
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }
      const bool check_get_entity =
          !injected_error_count && FLAGS_check_multiget_entity_consistency;

      for (size_t i = 0; i < num_keys; ++i) {
        const WideColumns& columns = get_columns(i);
        const Status& s = get_status(i);

        bool is_consistent = true;

        if (s.ok() && !VerifyWideColumns(columns)) {
          fprintf(
              stderr,
              "error : inconsistent columns returned by MultiGetEntity for key "
              "%s: %s\n",
              StringToHex(keys[i]).c_str(), WideColumnsToHex(columns).c_str());
          is_consistent = false;
        } else if (s.ok() || s.IsNotFound()) {
          if (!do_extra_check(keys[i], columns, s)) {
            is_consistent = false;
          } else if (check_get_entity) {
            PinnableWideColumns cmp_result;
            ThreadStatusUtil::SetThreadOperation(
                ThreadStatus::OperationType::OP_GETENTITY);
            const Status cmp_s = call_get_entity(key_slices[i], &cmp_result);

            if (!cmp_s.ok() && !cmp_s.IsNotFound()) {
              fprintf(stderr, "GetEntity error: %s\n",
                      cmp_s.ToString().c_str());
              is_consistent = false;
            } else if (cmp_s.IsNotFound()) {
              if (s.ok()) {
                fprintf(
                    stderr,
                    "Inconsistent results for key %s: MultiGetEntity returned "
                    "ok, GetEntity returned not found\n",
                    StringToHex(keys[i]).c_str());
                is_consistent = false;
              }
            } else {
              assert(cmp_s.ok());

              if (s.IsNotFound()) {
                fprintf(
                    stderr,
                    "Inconsistent results for key %s: MultiGetEntity returned "
                    "not found, GetEntity returned ok\n",
                    StringToHex(keys[i]).c_str());
                is_consistent = false;
              } else {
                assert(s.ok());

                const WideColumns& cmp_columns = cmp_result.columns();

                if (columns != cmp_columns) {
                  fprintf(stderr,
                          "Inconsistent results for key %s: MultiGetEntity "
                          "returned "
                          "%s, GetEntity returned %s\n",
                          StringToHex(keys[i]).c_str(),
                          WideColumnsToHex(columns).c_str(),
                          WideColumnsToHex(cmp_columns).c_str());
                  is_consistent = false;
                }
              }
            }
          }
        }

        if (!is_consistent) {
          fprintf(stderr,
                  "TestMultiGetEntity error: results are not consistent\n");
          thread->stats.AddErrors(1);
          // Fail fast to preserve the DB state
          thread->shared->SetVerificationFailure();
          break;
        } else if (s.ok()) {
          thread->stats.AddGets(1, 1);
        } else if (s.IsNotFound()) {
          thread->stats.AddGets(1, 0);
        } else if (injected_error_count == 0 ||
                   !IsErrorInjectedAndRetryable(s)) {
          fprintf(stderr, "MultiGetEntity error: %s\n", s.ToString().c_str());
          thread->stats.AddErrors(1);
          thread->shared->SetVerificationFailure();
        }
      }
      // Enable back error injection disbled for checking results
      if (fault_fs_guard) {
        fault_fs_guard->EnableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->EnableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }
    };

    if (FLAGS_use_txn) {
      // Transactional/read-your-own-writes MultiGetEntity verification
      std::vector<PinnableWideColumns> results(num_keys);
      std::vector<Status> statuses(num_keys);

      assert(txn);
      txn->MultiGetEntity(read_opts_copy, cfh, num_keys, key_slices.data(),
                          results.data(), statuses.data());

      auto ryw_check = [&](const std::string& key, const WideColumns& columns,
                           const Status& s) -> bool {
        const auto it = ryw_expected_values.find(key);
        if (it == ryw_expected_values.end()) {
          return true;
        }

        const auto& ryw_expected_value = it->second;

        if (s.ok()) {
          if (ryw_expected_value.IsDeleted()) {
            fprintf(
                stderr,
                "MultiGetEntity failed the read-your-own-write check for key "
                "%s\n",
                Slice(key).ToString(true).c_str());
            fprintf(stderr,
                    "MultiGetEntity returned ok, transaction has non-committed "
                    "delete\n");
            return false;
          } else {
            const uint32_t value_base = ryw_expected_value.GetValueBase();
            char expected_value[100];
            const size_t sz = GenerateValue(value_base, expected_value,
                                            sizeof(expected_value));
            const Slice expected_slice(expected_value, sz);
            const WideColumns expected_columns =
                GenerateExpectedWideColumns(value_base, expected_slice);

            if (columns != expected_columns) {
              fprintf(
                  stderr,
                  "MultiGetEntity failed the read-your-own-write check for key "
                  "%s\n",
                  Slice(key).ToString(true).c_str());
              fprintf(stderr, "MultiGetEntity returned %s\n",
                      WideColumnsToHex(columns).c_str());
              fprintf(stderr, "Transaction has non-committed write %s\n",
                      WideColumnsToHex(expected_columns).c_str());
              return false;
            }

            return true;
          }
        }

        assert(s.IsNotFound());
        if (!ryw_expected_value.IsDeleted()) {
          fprintf(stderr,
                  "MultiGetEntity failed the read-your-own-write check for key "
                  "%s\n",
                  Slice(key).ToString(true).c_str());
          fprintf(stderr,
                  "MultiGetEntity returned not found, transaction has "
                  "non-committed write\n");
          return false;
        }

        return true;
      };

      check_results([&](size_t i) { return results[i].columns(); },
                    [&](size_t i) { return statuses[i]; }, ryw_check,
                    [&](const Slice& key, PinnableWideColumns* result) {
                      return txn->GetEntity(read_opts_copy, cfh, key, result);
                    });

      txn->Rollback().PermitUncheckedError();
      // Enable back error injection disbled for transactions
      if (fault_fs_guard) {
        fault_fs_guard->EnableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->EnableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }
    } else if (FLAGS_use_attribute_group) {
      // AttributeGroup MultiGetEntity verification

      if (fault_fs_guard) {
        fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
            FaultInjectionIOType::kRead);
        fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
            FaultInjectionIOType::kMetadataRead);
        SharedState::ignore_read_error = false;
      }

      std::vector<PinnableAttributeGroups> results;
      results.reserve(num_keys);
      for (size_t i = 0; i < num_keys; ++i) {
        PinnableAttributeGroups attribute_groups;
        attribute_groups.emplace_back(cfh);
        results.emplace_back(std::move(attribute_groups));
      }

      db_->MultiGetEntity(read_opts_copy, num_keys, key_slices.data(),
                          results.data());

      if (fault_fs_guard) {
        verify_expected_errors(
            [&](size_t i) { return results[i][0].status(); });
      }

      // Compare against non-attribute-group GetEntity result
      check_results([&](size_t i) { return results[i][0].columns(); },
                    [&](size_t i) { return results[i][0].status(); },
                    [](const Slice& /* key */, const WideColumns& /* columns */,
                       const Status& /* s */) { return true; },
                    [&](const Slice& key, PinnableWideColumns* result) {
                      return db_->GetEntity(read_opts_copy, cfh, key, result);
                    });
    } else {
      // Non-AttributeGroup MultiGetEntity verification

      if (fault_fs_guard) {
        fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
            FaultInjectionIOType::kRead);
        fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
            FaultInjectionIOType::kMetadataRead);
        SharedState::ignore_read_error = false;
      }

      std::vector<PinnableWideColumns> results(num_keys);
      std::vector<Status> statuses(num_keys);

      db_->MultiGetEntity(read_opts_copy, cfh, num_keys, key_slices.data(),
                          results.data(), statuses.data());

      if (fault_fs_guard) {
        verify_expected_errors([&](size_t i) { return statuses[i]; });
      }

      check_results([&](size_t i) { return results[i].columns(); },
                    [&](size_t i) { return statuses[i]; },
                    [](const Slice& /* key */, const WideColumns& /* columns */,
                       const Status& /* s */) { return true; },
                    [&](const Slice& key, PinnableWideColumns* result) {
                      return db_->GetEntity(read_opts_copy, cfh, key, result);
                    });
    }
  }

  Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts,
                        const std::vector<int>& rand_column_families,
                        const std::vector<int64_t>& rand_keys) override {
    assert(!rand_column_families.empty());
    assert(!rand_keys.empty());

    ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]];
    assert(cfh);

    const std::string key = Key(rand_keys[0]);
    const Slice prefix(key.data(), FLAGS_prefix_size);

    std::string upper_bound;
    Slice ub_slice;
    ReadOptions ro_copy = read_opts;

    // Randomly test with `iterate_upper_bound` and `prefix_same_as_start`
    //
    // Get the next prefix first and then see if we want to set it to be the
    // upper bound. We'll use the next prefix in an assertion later on
    if (GetNextPrefix(prefix, &upper_bound) && thread->rand.OneIn(2)) {
      // For half of the time, set the upper bound to the next prefix
      ub_slice = Slice(upper_bound);
      ro_copy.iterate_upper_bound = &ub_slice;
      if (FLAGS_use_sqfc_for_range_queries) {
        ro_copy.table_filter =
            sqfc_factory_->GetTableFilterForRangeQuery(prefix, ub_slice);
      }
    } else if (options_.prefix_extractor && thread->rand.OneIn(2)) {
      ro_copy.prefix_same_as_start = true;
    }

    std::string read_ts_str;
    Slice read_ts_slice;
    MaybeUseOlderTimestampForRangeScan(thread, read_ts_str, read_ts_slice,
                                       ro_copy);

    if (fault_fs_guard) {
      fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
          FaultInjectionIOType::kRead);
      fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
          FaultInjectionIOType::kMetadataRead);
      SharedState::ignore_read_error = false;
    }

    std::unique_ptr<Iterator> iter(db_->NewIterator(ro_copy, cfh));

    uint64_t count = 0;
    Status s;

    for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
      // If upper or prefix bounds is specified, only keys of the target
      // prefix should show up. Otherwise, we need to manual exit the loop when
      // we see the first key that is not in the target prefix show up.
      if (ro_copy.iterate_upper_bound != nullptr ||
          ro_copy.prefix_same_as_start) {
        assert(iter->key().starts_with(prefix));
      } else if (!iter->key().starts_with(prefix)) {
        break;
      }
      ++count;

      // When iter_start_ts is set, iterator exposes internal keys, including
      // tombstones; however, we want to perform column validation only for
      // value-like types.
      if (ro_copy.iter_start_ts) {
        const ValueType value_type = ExtractValueType(iter->key());
        if (value_type != kTypeValue && value_type != kTypeBlobIndex &&
            value_type != kTypeWideColumnEntity) {
          continue;
        }
      }

      if (ro_copy.allow_unprepared_value) {
        if (!iter->PrepareValue()) {
          s = iter->status();
          break;
        }
      }

      if (!VerifyWideColumns(iter->value(), iter->columns())) {
        s = Status::Corruption("Value and columns inconsistent",
                               DebugString(iter->value(), iter->columns()));
        break;
      }
    }

    if (ro_copy.iter_start_ts == nullptr) {
      assert(count <= GetPrefixKeyCount(prefix.ToString(), upper_bound));
    }

    if (s.ok()) {
      s = iter->status();
    }

    int injected_error_count = 0;
    if (fault_fs_guard) {
      injected_error_count = GetMinInjectedErrorCount(
          fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
              FaultInjectionIOType::kRead),
          fault_fs_guard->GetAndResetInjectedThreadLocalErrorCount(
              FaultInjectionIOType::kMetadataRead));
      if (!SharedState::ignore_read_error && injected_error_count > 0 &&
          s.ok()) {
        // Grab mutex so multiple thread don't try to print the
        // stack trace at the same time
        MutexLock l(thread->shared->GetMutex());
        fprintf(stderr, "Didn't get expected error from PrefixScan\n");
        fprintf(stderr, "Callstack that injected the fault\n");
        fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
            FaultInjectionIOType::kRead);
        fault_fs_guard->PrintInjectedThreadLocalErrorBacktrace(
            FaultInjectionIOType::kMetadataRead);
        std::terminate();
      }
    }

    if (s.ok()) {
      thread->stats.AddPrefixes(1, count);
    } else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
      fprintf(stderr,
              "TestPrefixScan error: %s with ReadOptions::iterate_upper_bound: "
              "%s, prefix_same_as_start: %s \n",
              s.ToString().c_str(),
              ro_copy.iterate_upper_bound
                  ? ro_copy.iterate_upper_bound->ToString(true).c_str()
                  : "nullptr",
              ro_copy.prefix_same_as_start ? "true" : "false");
      thread->shared->SetVerificationFailure();
    }

    return s;
  }

  Status TestPut(ThreadState* thread, WriteOptions& write_opts,
                 const ReadOptions& read_opts,
                 const std::vector<int>& rand_column_families,
                 const std::vector<int64_t>& rand_keys,
                 char (&value)[100]) override {
    assert(!rand_column_families.empty());
    assert(!rand_keys.empty());

    auto shared = thread->shared;
    assert(shared);

    const int64_t max_key = shared->GetMaxKey();

    int64_t rand_key = rand_keys[0];
    int rand_column_family = rand_column_families[0];
    std::string write_ts;

    std::unique_ptr<MutexLock> lock(
        new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
    while (!shared->AllowsOverwrite(rand_key) &&
           (FLAGS_use_merge || shared->Exists(rand_column_family, rand_key))) {
      lock.reset();

      rand_key = thread->rand.Next() % max_key;
      rand_column_family = thread->rand.Next() % FLAGS_column_families;

      lock.reset(
          new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));
      if (FLAGS_user_timestamp_size > 0) {
        write_ts = GetNowNanos();
      }
    }

    if (write_ts.empty() && FLAGS_user_timestamp_size) {
      write_ts = GetNowNanos();
    }

    const std::string k = Key(rand_key);

    ColumnFamilyHandle* const cfh = column_families_[rand_column_family];
    assert(cfh);

    if (FLAGS_verify_before_write) {
      // Temporarily disable error injection for preparation
      if (fault_fs_guard) {
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->DisableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }

      std::string from_db;
      Status s = db_->Get(read_opts, cfh, k, &from_db);
      bool res = VerifyOrSyncValue(
          rand_column_family, rand_key, read_opts, shared,
          /* msg_prefix */ "Pre-Put Get verification", from_db, s);

      // Enable back error injection disabled for preparation
      if (fault_fs_guard) {
        fault_fs_guard->EnableThreadLocalErrorInjection(
            FaultInjectionIOType::kRead);
        fault_fs_guard->EnableThreadLocalErrorInjection(
            FaultInjectionIOType::kMetadataRead);
      }
      if (!res) {
        return s;
      }
    }

    // To track the final write status
    Status s;
    // To track the initial write status
    Status initial_write_s;
    // To track whether WAL write may have succeeded during the initial failed
    // write
    bool initial_wal_write_may_succeed = true;

    PendingExpectedValue pending_expected_value =
        shared->PreparePut(rand_column_family, rand_key);

    const uint32_t value_base = pending_expected_value.GetFinalValueBase();
    const size_t sz = GenerateValue(value_base, value, sizeof(value));
    const Slice v(value, sz);

    uint64_t wait_for_recover_start_time = 0;
    do {
      // In order to commit the expected state for the initial write failed with
      // injected retryable error and successful WAL write, retry the write
      // until it succeeds after the recovery finishes
      if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
          initial_wal_write_may_succeed) {
        std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
      }
      if (FLAGS_use_put_entity_one_in > 0 &&
          (value_base % FLAGS_use_put_entity_one_in) == 0) {
        if (!FLAGS_use_txn) {
          if (FLAGS_use_attribute_group) {
            s = db_->PutEntity(write_opts, k,
                               GenerateAttributeGroups({cfh}, value_base, v));
          } else {
            s = db_->PutEntity(write_opts, cfh, k,
                               GenerateWideColumns(value_base, v));
          }
        } else {
          s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
            return txn.PutEntity(cfh, k, GenerateWideColumns(value_base, v));
          });
        }
      } else if (FLAGS_use_timed_put_one_in > 0 &&
                 ((value_base + kLargePrimeForCommonFactorSkew) %
                  FLAGS_use_timed_put_one_in) == 0) {
        WriteBatch wb;
        uint64_t write_unix_time = GetWriteUnixTime(thread);
        s = wb.TimedPut(cfh, k, v, write_unix_time);
        if (s.ok()) {
          s = db_->Write(write_opts, &wb);
        }
      } else if (FLAGS_use_merge) {
        if (!FLAGS_use_txn) {
          if (FLAGS_user_timestamp_size == 0) {
            s = db_->Merge(write_opts, cfh, k, v);
          } else {
            s = db_->Merge(write_opts, cfh, k, write_ts, v);
          }
        } else {
          s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
            return txn.Merge(cfh, k, v);
          });
        }
      } else {
        if (!FLAGS_use_txn) {
          if (FLAGS_user_timestamp_size == 0) {
            s = db_->Put(write_opts, cfh, k, v);
          } else {
            s = db_->Put(write_opts, cfh, k, write_ts, v);
          }
        } else {
          s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
            return txn.Put(cfh, k, v);
          });
        }
      }
      UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
                                &initial_wal_write_may_succeed,
                                &wait_for_recover_start_time);

    } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
             initial_wal_write_may_succeed);

    if (!s.ok()) {
      pending_expected_value.Rollback();
      if (IsErrorInjectedAndRetryable(s)) {
        assert(!initial_wal_write_may_succeed);
        return s;
      } else if (FLAGS_inject_error_severity == 2) {
        if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
          is_db_stopped_ = true;
        } else if (!is_db_stopped_ ||
                   s.severity() < Status::Severity::kFatalError) {
          fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
          thread->shared->SafeTerminate();
        }
      } else {
        fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str());
        thread->shared->SafeTerminate();
      }
    } else {
      PrintWriteRecoveryWaitTimeIfNeeded(
          db_stress_env, initial_write_s, initial_wal_write_may_succeed,
          wait_for_recover_start_time, "TestPut");
      pending_expected_value.Commit();
      thread->stats.AddBytesForWrites(1, sz);
      PrintKeyValue(rand_column_family, static_cast<uint32_t>(rand_key), value,
                    sz);
    }
    return s;
  }

  Status TestDelete(ThreadState* thread, WriteOptions& write_opts,
                    const std::vector<int>& rand_column_families,
                    const std::vector<int64_t>& rand_keys) override {
    int64_t rand_key = rand_keys[0];
    int rand_column_family = rand_column_families[0];
    auto shared = thread->shared;

    std::unique_ptr<MutexLock> lock(
        new MutexLock(shared->GetMutexForKey(rand_column_family, rand_key)));

    // OPERATION delete
    std::string write_ts_str = GetNowNanos();
    Slice write_ts = write_ts_str;

    std::string key_str = Key(rand_key);
    Slice key = key_str;
    auto cfh = column_families_[rand_column_family];

    // To track the final write status
    Status s;
    // To track the initial write status
    Status initial_write_s;
    // To track whether WAL write may have succeeded during the initial failed
    // write
    bool initial_wal_write_may_succeed = true;

    // Use delete if the key may be overwritten and a single deletion
    // otherwise.
    if (shared->AllowsOverwrite(rand_key)) {
      PendingExpectedValue pending_expected_value =
          shared->PrepareDelete(rand_column_family, rand_key);

      uint64_t wait_for_recover_start_time = 0;
      do {
        // In order to commit the expected state for the initial write failed
        // with injected retryable error and successful WAL write, retry the
        // write until it succeeds after the recovery finishes
        if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
            initial_wal_write_may_succeed) {
          std::this_thread::sleep_for(
              std::chrono::microseconds(1 * 1000 * 1000));
        }
        if (!FLAGS_use_txn) {
          if (FLAGS_user_timestamp_size == 0) {
            s = db_->Delete(write_opts, cfh, key);
          } else {
            s = db_->Delete(write_opts, cfh, key, write_ts);
          }
        } else {
          s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
            return txn.Delete(cfh, key);
          });
        }
        UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
                                  &initial_wal_write_may_succeed,
                                  &wait_for_recover_start_time);
      } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
               initial_wal_write_may_succeed);

      if (!s.ok()) {
        pending_expected_value.Rollback();
        if (IsErrorInjectedAndRetryable(s)) {
          assert(!initial_wal_write_may_succeed);
          return s;
        } else if (FLAGS_inject_error_severity == 2) {
          if (!is_db_stopped_ &&
              s.severity() >= Status::Severity::kFatalError) {
            is_db_stopped_ = true;
          } else if (!is_db_stopped_ ||
                     s.severity() < Status::Severity::kFatalError) {
            fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
            thread->shared->SafeTerminate();
          }
        } else {
          fprintf(stderr, "delete error: %s\n", s.ToString().c_str());
          thread->shared->SafeTerminate();
        }
      } else {
        PrintWriteRecoveryWaitTimeIfNeeded(
            db_stress_env, initial_write_s, initial_wal_write_may_succeed,
            wait_for_recover_start_time, "TestDelete");
        pending_expected_value.Commit();
        thread->stats.AddDeletes(1);
      }
    } else {
      PendingExpectedValue pending_expected_value =
          shared->PrepareSingleDelete(rand_column_family, rand_key);

      uint64_t wait_for_recover_start_time = 0;
      do {
        // In order to commit the expected state for the initial write failed
        // with injected retryable error and successful WAL write, retry the
        // write until it succeeds after the recovery finishes
        if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
            initial_wal_write_may_succeed) {
          std::this_thread::sleep_for(
              std::chrono::microseconds(1 * 1000 * 1000));
        }
        if (!FLAGS_use_txn) {
          if (FLAGS_user_timestamp_size == 0) {
            s = db_->SingleDelete(write_opts, cfh, key);
          } else {
            s = db_->SingleDelete(write_opts, cfh, key, write_ts);
          }
        } else {
          s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) {
            return txn.SingleDelete(cfh, key);
          });
        }
        UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
                                  &initial_wal_write_may_succeed,
                                  &wait_for_recover_start_time);
      } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
               initial_wal_write_may_succeed);

      if (!s.ok()) {
        pending_expected_value.Rollback();
        if (IsErrorInjectedAndRetryable(s)) {
          assert(!initial_wal_write_may_succeed);
          return s;
        } else if (FLAGS_inject_error_severity == 2) {
          if (!is_db_stopped_ &&
              s.severity() >= Status::Severity::kFatalError) {
            is_db_stopped_ = true;
          } else if (!is_db_stopped_ ||
                     s.severity() < Status::Severity::kFatalError) {
            fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
            thread->shared->SafeTerminate();
          }
        } else {
          fprintf(stderr, "single delete error: %s\n", s.ToString().c_str());
          thread->shared->SafeTerminate();
        }
      } else {
        PrintWriteRecoveryWaitTimeIfNeeded(
            db_stress_env, initial_write_s, initial_wal_write_may_succeed,
            wait_for_recover_start_time, "TestDelete");
        pending_expected_value.Commit();
        thread->stats.AddSingleDeletes(1);
      }
    }
    return s;
  }

  Status TestDeleteRange(ThreadState* thread, WriteOptions& write_opts,
                         const std::vector<int>& rand_column_families,
                         const std::vector<int64_t>& rand_keys) override {
    // OPERATION delete range
    std::vector<std::unique_ptr<MutexLock>> range_locks;
    // delete range does not respect disallowed overwrites. the keys for
    // which overwrites are disallowed are randomly distributed so it
    // could be expensive to find a range where each key allows
    // overwrites.
    int64_t rand_key = rand_keys[0];
    int rand_column_family = rand_column_families[0];
    auto shared = thread->shared;
    int64_t max_key = shared->GetMaxKey();
    if (rand_key > max_key - FLAGS_range_deletion_width) {
      rand_key =
          thread->rand.Next() % (max_key - FLAGS_range_deletion_width + 1);
    }
    GetDeleteRangeKeyLocks(thread, rand_column_family, rand_key, &range_locks);

    // To track the final write status
    Status s;
    // To track the initial write status
    Status initial_write_s;
    // To track whether WAL write may have succeeded during the initial failed
    // write
    bool initial_wal_write_may_succeed = true;

    std::vector<PendingExpectedValue> pending_expected_values =
        shared->PrepareDeleteRange(rand_column_family, rand_key,
                                   rand_key + FLAGS_range_deletion_width);

    const int covered = static_cast<int>(pending_expected_values.size());
    std::string keystr = Key(rand_key);
    Slice key = keystr;
    auto cfh = column_families_[rand_column_family];
    std::string end_keystr = Key(rand_key + FLAGS_range_deletion_width);
    Slice end_key = end_keystr;
    std::string write_ts_str;
    Slice write_ts;
    uint64_t wait_for_recover_start_time = 0;

    do {
      // In order to commit the expected state for the initial write failed with
      // injected retryable error and successful WAL write, retry the write
      // until it succeeds after the recovery finishes
      if (!s.ok() && IsErrorInjectedAndRetryable(s) &&
          initial_wal_write_may_succeed) {
        std::this_thread::sleep_for(std::chrono::microseconds(1 * 1000 * 1000));
      }
      if (FLAGS_user_timestamp_size) {
        write_ts_str = GetNowNanos();
        write_ts = write_ts_str;
        s = db_->DeleteRange(write_opts, cfh, key, end_key, write_ts);
      } else {
        s = db_->DeleteRange(write_opts, cfh, key, end_key);
      }
      UpdateIfInitialWriteFails(db_stress_env, s, &initial_write_s,
                                &initial_wal_write_may_succeed,
                                &wait_for_recover_start_time);
    } while (!s.ok() && IsErrorInjectedAndRetryable(s) &&
             initial_wal_write_may_succeed);

    if (!s.ok()) {
      for (PendingExpectedValue& pending_expected_value :
           pending_expected_values) {
        pending_expected_value.Rollback();
      }
      if (IsErrorInjectedAndRetryable(s)) {
        assert(!initial_wal_write_may_succeed);
        return s;
      } else if (FLAGS_inject_error_severity == 2) {
        if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) {
          is_db_stopped_ = true;
        } else if (!is_db_stopped_ ||
                   s.severity() < Status::Severity::kFatalError) {
          fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
          thread->shared->SafeTerminate();
        }
      } else {
        fprintf(stderr, "delete range error: %s\n", s.ToString().c_str());
        thread->shared->SafeTerminate();
      }
    } else {
      PrintWriteRecoveryWaitTimeIfNeeded(
          db_stress_env, initial_write_s, initial_wal_write_may_succeed,
          wait_for_recover_start_time, "TestDeleteRange");
      for (PendingExpectedValue& pending_expected_value :
           pending_expected_values) {
        pending_expected_value.Commit();
      }
      thread->stats.AddRangeDeletions(1);
      thread->stats.AddCoveredByRangeDeletions(covered);
    }
    return s;
  }

  void TestIngestExternalFile(ThreadState* thread,
                              const std::vector<int>& rand_column_families,
                              const std::vector<int64_t>& rand_keys) override {
    // When true, we create two sst files, the first one with regular puts for
    // a continuous range of keys, the second one with a standalone range
    // deletion for all the keys. This is to exercise the standalone range
    // deletion file's compaction input optimization.
    bool test_standalone_range_deletion = thread->rand.OneInOpt(
        FLAGS_test_ingest_standalone_range_deletion_one_in);
    std::vector<std::string> external_files;
    const std::string sst_filename =
        FLAGS_db + "/." + std::to_string(thread->tid) + ".sst";
    external_files.push_back(sst_filename);
    std::string standalone_rangedel_filename;
    if (test_standalone_range_deletion) {
      standalone_rangedel_filename = FLAGS_db + "/." +
                                     std::to_string(thread->tid) +
                                     "_standalone_rangedel.sst";
      external_files.push_back(standalone_rangedel_filename);
    }
    Status s;
    std::ostringstream ingest_options_oss;

    // Temporarily disable error injection for preparation
    if (fault_fs_guard) {
      fault_fs_guard->DisableThreadLocalErrorInjection(
          FaultInjectionIOType::kMetadataRead);
      fault_fs_guard->DisableThreadLocalErrorInjection(
          FaultInjectionIOType::kMetadataWrite);
    }

    for (const auto& filename : external_files) {
      if (db_stress_env->FileExists(filename).ok()) {
        // Maybe we terminated abnormally before, so cleanup to give this file
        // ingestion a clean slate
        s = db_stress_env->DeleteFile(filename);
      }
      if (!s.ok()) {
        return;
      }
    }

    if (fault_fs_guard) {
      fault_fs_guard->EnableThreadLocalErrorInjection(
          FaultInjectionIOType::kMetadataRead);
      fault_fs_guard->EnableThreadLocalErrorInjection(
          FaultInjectionIOType::kMetadataWrite);
    }

    SstFileWriter sst_file_writer(EnvOptions(options_), options_);
    SstFileWriter standalone_rangedel_sst_file_writer(EnvOptions(options_),
                                                      options_);
    if (s.ok()) {
      s = sst_file_writer.Open(sst_filename);
    }
    if (s.ok() && test_standalone_range_deletion) {
      s = standalone_rangedel_sst_file_writer.Open(
          standalone_rangedel_filename);
    }
    if (!s.ok()) {
      return;
    }

    int64_t key_base = rand_keys[0];
    int column_family = rand_column_families[0];
    std::vector<std::unique_ptr<MutexLock>> range_locks;
    range_locks.reserve(FLAGS_ingest_external_file_width);
    std::vector<int64_t> keys;
    keys.reserve(FLAGS_ingest_external_file_width);
    std::vector<uint32_t> values;
    values.reserve(FLAGS_ingest_external_file_width);
    std::vector<PendingExpectedValue> pending_expected_values;
    pending_expected_values.reserve(FLAGS_ingest_external_file_width);
    SharedState* shared = thread->shared;

    // Grab locks, add keys
    assert(FLAGS_nooverwritepercent < 100);
    for (int64_t key = key_base;
         key < shared->GetMaxKey() &&
         key < key_base + FLAGS_ingest_external_file_width;
         ++key) {
      if (key == key_base ||
          (key & ((1 << FLAGS_log2_keys_per_lock) - 1)) == 0) {
        range_locks.emplace_back(
            new MutexLock(shared->GetMutexForKey(column_family, key)));
      }
      if (test_standalone_range_deletion) {
        // Testing standalone range deletion needs a continuous range of keys.
        if (shared->AllowsOverwrite(key)) {
          if (keys.empty() || (!keys.empty() && keys.back() == key - 1)) {
            keys.push_back(key);
          } else {
            keys.clear();
            keys.push_back(key);
          }
        } else {
          if (keys.size() > 0) {
            break;
          } else {
            continue;
          }
        }
      } else {
        if (!shared->AllowsOverwrite(key)) {
          // We could alternatively include `key` that is deleted.
          continue;
        }
        keys.push_back(key);
      }
    }

    if (s.ok() && keys.empty()) {
      return;
    }

    // set pending state on expected values, create and ingest files.
    size_t total_keys = keys.size();
    for (size_t i = 0; s.ok() && i < total_keys; i++) {
      int64_t key = keys.at(i);
      char value[100];
      auto key_str = Key(key);
      const Slice k(key_str);
      Slice v;
      if (test_standalone_range_deletion) {
        assert(i == 0 || keys.at(i - 1) == key - 1);
        s = sst_file_writer.Put(k, v);
      } else {
        PendingExpectedValue pending_expected_value =
            shared->PreparePut(column_family, key);
        const uint32_t value_base = pending_expected_value.GetFinalValueBase();
        const size_t value_len =
            GenerateValue(value_base, value, sizeof(value));
        v = Slice(value, value_len);
        values.push_back(value_base);
        pending_expected_values.push_back(pending_expected_value);
        if (FLAGS_use_put_entity_one_in > 0 &&
            (value_base % FLAGS_use_put_entity_one_in) == 0) {
          WideColumns columns = GenerateWideColumns(values.back(), v);
          s = sst_file_writer.PutEntity(k, columns);
        } else {
          s = sst_file_writer.Put(k, v);
        }
      }
    }
    if (s.ok() && !keys.empty()) {
      s = sst_file_writer.Finish();
    }

    if (s.ok() && total_keys != 0 && test_standalone_range_deletion) {
      int64_t start_key = keys.at(0);
      int64_t end_key = keys.back() + 1;
      pending_expected_values =
          shared->PrepareDeleteRange(column_family, start_key, end_key);
      auto start_key_str = Key(start_key);
      const Slice start_key_slice(start_key_str);
      auto end_key_str = Key(end_key);
      const Slice end_key_slice(end_key_str);
      s = standalone_rangedel_sst_file_writer.DeleteRange(start_key_slice,
                                                          end_key_slice);
      if (s.ok()) {
        s = standalone_rangedel_sst_file_writer.Finish();
      }
    }
    if (s.ok()) {
      IngestExternalFileOptions ingest_options;
      ingest_options.move_files = thread->rand.OneInOpt(2);
      ingest_options.verify_checksums_before_ingest = thread->rand.OneInOpt(2);
      ingest_options.verify_checksums_readahead_size =
          thread->rand.OneInOpt(2) ? 1024 * 1024 : 0;
      ingest_options.fill_cache = thread->rand.OneInOpt(4);
      ingest_options_oss << "move_files: " << ingest_options.move_files
                         << ", verify_checksums_before_ingest: "
                         << ingest_options.verify_checksums_before_ingest
                         << ", verify_checksums_readahead_size: "
                         << ingest_options.verify_checksums_readahead_size
                         << ", fill_cache: " << ingest_options.fill_cache
                         << ", test_standalone_range_deletion: "
                         << test_standalone_range_deletion;
      s = db_->IngestExternalFile(column_families_[column_family],
                                  external_files, ingest_options);
    }
    if (!s.ok()) {
      for (PendingExpectedValue& pending_expected_value :
           pending_expected_values) {
        pending_expected_value.Rollback();
      }

      if (!IsErrorInjectedAndRetryable(s)) {
        fprintf(stderr,
                "file ingestion error: %s under specified "
                "IngestExternalFileOptions: %s (Empty string or "
                "missing field indicates default option or value is used)\n",
                s.ToString().c_str(), ingest_options_oss.str().c_str());
        thread->shared->SafeTerminate();
      }
    } else {
      for (PendingExpectedValue& pending_expected_value :
           pending_expected_values) {
        pending_expected_value.Commit();
      }
    }
  }

  // Given a key K, this creates an iterator which scans the range
  // [K, K + FLAGS_num_iterations) forward and backward.
  // Then does a random sequence of Next/Prev operations.
  Status TestIterateAgainstExpected(
      ThreadState* thread, const ReadOptions& read_opts,
      const std::vector<int>& rand_column_families,
      const std::vector<int64_t>& rand_keys) override {
    assert(thread);
    assert(!rand_column_families.empty());
    assert(!rand_keys.empty());

    auto shared = thread->shared;
    assert(shared);

    int64_t max_key = shared->GetMaxKey();

    const int64_t num_iter = static_cast<int64_t>(FLAGS_num_iterations);

    int64_t lb = rand_keys[0];
    if (lb > max_key - num_iter) {
      lb = thread->rand.Next() % (max_key - num_iter + 1);
    }

    const int64_t ub = lb + num_iter;

    const int rand_column_family = rand_column_families[0];

    // Testing parallel read and write to the same key with user timestamp
    // is not currently supported
    std::vector<std::unique_ptr<MutexLock>> range_locks;
    if (FLAGS_user_timestamp_size > 0) {
      range_locks = shared->GetLocksForKeyRange(rand_column_family, lb, ub);
    }

    ReadOptions ro(read_opts);
    if (FLAGS_prefix_size > 0) {
      ro.total_order_seek = true;
    }

    std::string read_ts_str;
    Slice read_ts;
    if (FLAGS_user_timestamp_size > 0) {
      read_ts_str = GetNowNanos();
      read_ts = read_ts_str;
      ro.timestamp = &read_ts;
    }

    std::string max_key_str;
    Slice max_key_slice;
    if (!FLAGS_destroy_db_initially) {
      max_key_str = Key(max_key);
      max_key_slice = max_key_str;
      // to restrict iterator from reading keys written in batched_op_stress
      // that do not have expected state updated and may not be parseable by
      // GetIntVal().
      ro.iterate_upper_bound = &max_key_slice;
    }
    std::string ub_str, lb_str;
    if (FLAGS_use_sqfc_for_range_queries) {
      ub_str = Key(ub);
      lb_str = Key(lb);
      ro.table_filter =
          sqfc_factory_->GetTableFilterForRangeQuery(lb_str, ub_str);
    }

    ColumnFamilyHandle* const cfh = column_families_[rand_column_family];
    assert(cfh);

    const std::size_t expected_values_size = static_cast<std::size_t>(ub - lb);
    std::vector<ExpectedValue> pre_read_expected_values;
    std::vector<ExpectedValue> post_read_expected_values;

    for (int64_t i = 0; i < static_cast<int64_t>(expected_values_size); ++i) {
      pre_read_expected_values.push_back(
          shared->Get(rand_column_family, i + lb));
    }
    std::unique_ptr<Iterator> iter;
    if (FLAGS_use_multi_cf_iterator) {
      std::vector<ColumnFamilyHandle*> cfhs;
      cfhs.reserve(rand_column_families.size());
      for (auto cf_index : rand_column_families) {
        cfhs.emplace_back(column_families_[cf_index]);
      }
      assert(!cfhs.empty());
      iter = db_->NewCoalescingIterator(ro, cfhs);
    } else {
      iter = std::unique_ptr<Iterator>(db_->NewIterator(ro, cfh));
    }

    for (int64_t i = 0; i < static_cast<int64_t>(expected_values_size); ++i) {
      post_read_expected_values.push_back(
          shared->Get(rand_column_family, i + lb));
    }

    assert(pre_read_expected_values.size() == expected_values_size &&
           pre_read_expected_values.size() == post_read_expected_values.size());

    std::string op_logs;

    auto check_columns = [&]() {
      assert(iter);
      assert(iter->Valid());

      if (!VerifyWideColumns(iter->value(), iter->columns())) {
        shared->SetVerificationFailure();

        fprintf(stderr,
                "Verification failed for key %s: "
                "Value and columns inconsistent: value: %s, columns: %s\n",
                Slice(iter->key()).ToString(/* hex */ true).c_str(),
                iter->value().ToString(/* hex */ true).c_str(),
                WideColumnsToHex(iter->columns()).c_str());
        fprintf(stderr, "Column family: %s, op_logs: %s\n",
                cfh->GetName().c_str(), op_logs.c_str());

        thread->stats.AddErrors(1);

        return false;
      }

      return true;
    };

    auto check_no_key_in_range = [&](int64_t start, int64_t end) {
      assert(start <= end);
      for (auto j = std::max(start, lb); j < std::min(end, ub); ++j) {
        std::size_t index = static_cast<std::size_t>(j - lb);
        assert(index < pre_read_expected_values.size() &&
               index < post_read_expected_values.size());
        const ExpectedValue pre_read_expected_value =
            pre_read_expected_values[index];
        const ExpectedValue post_read_expected_value =
            post_read_expected_values[index];
        if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value,
                                                 post_read_expected_value)) {
          // Fail fast to preserve the DB state.
          thread->shared->SetVerificationFailure();
          if (iter->Valid()) {
            fprintf(stderr,
                    "Verification failed. Expected state has key %s, iterator "
                    "is at key %s\n",
                    Slice(Key(j)).ToString(true).c_str(),
                    iter->key().ToString(true).c_str());
          } else {
            fprintf(stderr,
                    "Verification failed. Expected state has key %s, iterator "
                    "is invalid\n",
                    Slice(Key(j)).ToString(true).c_str());
          }
          fprintf(stderr, "Column family: %s, op_logs: %s\n",
                  cfh->GetName().c_str(), op_logs.c_str());
          thread->stats.AddErrors(1);
          return false;
        }
      }
      return true;
    };

    // Forward and backward scan to ensure we cover the entire range [lb, ub).
    // The random sequence Next and Prev test below tends to be very short
    // ranged.
    int64_t last_key = lb - 1;

    std::string key_str = Key(lb);
    iter->Seek(key_str);

    op_logs += "S " + Slice(key_str).ToString(true) + " ";

    uint64_t curr = 0;
    while (true) {
      assert(last_key < ub);

      if (iter->Valid() && ro.allow_unprepared_value) {
        op_logs += "*";

        if (!iter->PrepareValue()) {
          assert(!iter->Valid());
          assert(!iter->status().ok());
        }
      }

      if (!iter->Valid()) {
        if (!iter->status().ok()) {
          if (IsErrorInjectedAndRetryable(iter->status())) {
            return iter->status();
          } else {
            thread->shared->SetVerificationFailure();
            fprintf(stderr, "TestIterate against expected state error: %s\n",
                    iter->status().ToString().c_str());
            fprintf(stderr, "Column family: %s, op_logs: %s\n",
                    cfh->GetName().c_str(), op_logs.c_str());
            thread->stats.AddErrors(1);
            return iter->status();
          }
        }
        if (!check_no_key_in_range(last_key + 1, ub)) {
          return Status::OK();
        }
        break;
      }

      if (!check_columns()) {
        return Status::OK();
      }

      // iter is valid, the range (last_key, current key) was skipped
      GetIntVal(iter->key().ToString(), &curr);
      if (static_cast<int64_t>(curr) <= last_key) {
        thread->shared->SetVerificationFailure();
        fprintf(stderr,
                "TestIterateAgainstExpected failed: found unexpectedly small "
                "key\n");
        fprintf(stderr, "Column family: %s, op_logs: %s\n",
                cfh->GetName().c_str(), op_logs.c_str());
        fprintf(stderr, "Last op found key: %s, expected at least: %s\n",
                Slice(Key(curr)).ToString(true).c_str(),
                Slice(Key(last_key + 1)).ToString(true).c_str());
        thread->stats.AddErrors(1);
        return Status::OK();
      }
      if (!check_no_key_in_range(last_key + 1, static_cast<int64_t>(curr))) {
        return Status::OK();
      }

      last_key = static_cast<int64_t>(curr);
      if (last_key >= ub - 1) {
        break;
      }

      iter->Next();

      op_logs += "N";
    }

    // backward scan
    key_str = Key(ub - 1);
    iter->SeekForPrev(key_str);

    op_logs += " SFP " + Slice(key_str).ToString(true) + " ";

    last_key = ub;
    while (true) {
      assert(lb < last_key);

      if (iter->Valid() && ro.allow_unprepared_value) {
        op_logs += "*";

        if (!iter->PrepareValue()) {
          assert(!iter->Valid());
          assert(!iter->status().ok());
        }
      }

      if (!iter->Valid()) {
        if (!iter->status().ok()) {
          if (IsErrorInjectedAndRetryable(iter->status())) {
            return iter->status();
          } else {
            thread->shared->SetVerificationFailure();
            fprintf(stderr, "TestIterate against expected state error: %s\n",
                    iter->status().ToString().c_str());
            fprintf(stderr, "Column family: %s, op_logs: %s\n",
                    cfh->GetName().c_str(), op_logs.c_str());
            thread->stats.AddErrors(1);
            return iter->status();
          }
        }
        if (!check_no_key_in_range(lb, last_key)) {
          return Status::OK();
        }
        break;
      }

      if (!check_columns()) {
        return Status::OK();
      }

      // the range (current key, last key) was skipped
      GetIntVal(iter->key().ToString(), &curr);
      if (last_key <= static_cast<int64_t>(curr)) {
        thread->shared->SetVerificationFailure();
        fprintf(stderr,
                "TestIterateAgainstExpected failed: found unexpectedly large "
                "key\n");
        fprintf(stderr, "Column family: %s, op_logs: %s\n",
                cfh->GetName().c_str(), op_logs.c_str());
        fprintf(stderr, "Last op found key: %s, expected at most: %s\n",
                Slice(Key(curr)).ToString(true).c_str(),
                Slice(Key(last_key - 1)).ToString(true).c_str());
        thread->stats.AddErrors(1);
        return Status::OK();
      }
      if (!check_no_key_in_range(static_cast<int64_t>(curr + 1), last_key)) {
        return Status::OK();
      }

      last_key = static_cast<int64_t>(curr);
      if (last_key <= lb) {
        break;
      }

      iter->Prev();

      op_logs += "P";
    }

    // Write-prepared/write-unprepared transactions and multi-CF iterator do not
    // support Refresh() yet.
    if (!(FLAGS_use_txn && FLAGS_txn_write_policy != 0) &&
        !FLAGS_use_multi_cf_iterator && thread->rand.OneIn(2)) {
      pre_read_expected_values.clear();
      post_read_expected_values.clear();
      // Refresh after forward/backward scan to allow higher chance of SV
      // change.
      for (int64_t i = 0; i < static_cast<int64_t>(expected_values_size); ++i) {
        pre_read_expected_values.push_back(
            shared->Get(rand_column_family, i + lb));
      }
      Status rs = iter->Refresh();
      if (!rs.ok() && IsErrorInjectedAndRetryable(rs)) {
        return rs;
      }
      assert(rs.ok());
      op_logs += "Refresh ";
      for (int64_t i = 0; i < static_cast<int64_t>(expected_values_size); ++i) {
        post_read_expected_values.push_back(
            shared->Get(rand_column_family, i + lb));
      }

      assert(pre_read_expected_values.size() == expected_values_size &&
             pre_read_expected_values.size() ==
                 post_read_expected_values.size());
    }

    // start from middle of [lb, ub) otherwise it is easy to iterate out of
    // locked range
    const int64_t mid = lb + num_iter / 2;

    key_str = Key(mid);
    const Slice key(key_str);

    if (thread->rand.OneIn(2)) {
      iter->Seek(key);
      op_logs += " S " + key.ToString(true) + " ";
      if (!iter->Valid() && iter->status().ok()) {
        if (!check_no_key_in_range(mid, ub)) {
          return Status::OK();
        }
      } else if (iter->Valid()) {
        GetIntVal(iter->key().ToString(), &curr);
        if (static_cast<int64_t>(curr) < mid) {
          thread->shared->SetVerificationFailure();
          fprintf(stderr,
                  "TestIterateAgainstExpected failed: found unexpectedly small "
                  "key\n");
          fprintf(stderr, "Column family: %s, op_logs: %s\n",
                  cfh->GetName().c_str(), op_logs.c_str());
          fprintf(stderr, "Last op found key: %s, expected at least: %s\n",
                  Slice(Key(curr)).ToString(true).c_str(),
                  Slice(Key(mid)).ToString(true).c_str());
          thread->stats.AddErrors(1);
          return Status::OK();
        }
      }
    } else {
      iter->SeekForPrev(key);
      op_logs += " SFP " + key.ToString(true) + " ";
      if (!iter->Valid() && iter->status().ok()) {
        // iterator says nothing <= mid
        if (!check_no_key_in_range(lb, mid + 1)) {
          return Status::OK();
        }
      } else if (iter->Valid()) {
        GetIntVal(iter->key().ToString(), &curr);
        if (mid < static_cast<int64_t>(curr)) {
          thread->shared->SetVerificationFailure();
          fprintf(stderr,
                  "TestIterateAgainstExpected failed: found unexpectedly large "
                  "key\n");
          fprintf(stderr, "Column family: %s, op_logs: %s\n",
                  cfh->GetName().c_str(), op_logs.c_str());
          fprintf(stderr, "Last op found key: %s, expected at most: %s\n",
                  Slice(Key(curr)).ToString(true).c_str(),
                  Slice(Key(mid)).ToString(true).c_str());
          thread->stats.AddErrors(1);
          return Status::OK();
        }
      }
    }

    for (int64_t i = 0; i < num_iter && iter->Valid(); ++i) {
      if (ro.allow_unprepared_value) {
        op_logs += "*";

        if (!iter->PrepareValue()) {
          assert(!iter->Valid());
          assert(!iter->status().ok());
          break;
        }
      }

      if (!check_columns()) {
        return Status::OK();
      }

      GetIntVal(iter->key().ToString(), &curr);
      if (static_cast<int64_t>(curr) < lb) {
        iter->Next();
        op_logs += "N";
      } else if (static_cast<int64_t>(curr) >= ub) {
        iter->Prev();
        op_logs += "P";
      } else {
        const uint32_t value_base_from_db = GetValueBase(iter->value());
        std::size_t index = static_cast<std::size_t>(curr - lb);
        assert(index < pre_read_expected_values.size() &&
               index < post_read_expected_values.size());
        const ExpectedValue pre_read_expected_value =
            pre_read_expected_values[index];
        const ExpectedValue post_read_expected_value =
            post_read_expected_values[index];
        if (ExpectedValueHelper::MustHaveNotExisted(pre_read_expected_value,
                                                    post_read_expected_value) ||
            !ExpectedValueHelper::InExpectedValueBaseRange(
                value_base_from_db, pre_read_expected_value,
                post_read_expected_value)) {
          // Fail fast to preserve the DB state.
          thread->shared->SetVerificationFailure();
          fprintf(stderr,
                  "Verification failed: iterator has key %s, but expected "
                  "state does not.\n",
                  iter->key().ToString(true).c_str());
          fprintf(stderr, "Column family: %s, op_logs: %s\n",
                  cfh->GetName().c_str(), op_logs.c_str());
          thread->stats.AddErrors(1);
          break;
        }

        if (thread->rand.OneIn(2)) {
          iter->Next();
          op_logs += "N";
          if (!iter->Valid()) {
            break;
          }
          uint64_t next = 0;
          GetIntVal(iter->key().ToString(), &next);
          if (next <= curr) {
            thread->shared->SetVerificationFailure();
            fprintf(stderr,
                    "TestIterateAgainstExpected failed: found unexpectedly "
                    "small key\n");
            fprintf(stderr, "Column family: %s, op_logs: %s\n",
                    cfh->GetName().c_str(), op_logs.c_str());
            fprintf(stderr, "Last op found key: %s, expected at least: %s\n",
                    Slice(Key(next)).ToString(true).c_str(),
                    Slice(Key(curr + 1)).ToString(true).c_str());
            thread->stats.AddErrors(1);
            return Status::OK();
          }
          if (!check_no_key_in_range(static_cast<int64_t>(curr + 1),
                                     static_cast<int64_t>(next))) {
            return Status::OK();
          }
        } else {
          iter->Prev();
          op_logs += "P";
          if (!iter->Valid()) {
            break;
          }
          uint64_t prev = 0;
          GetIntVal(iter->key().ToString(), &prev);
          if (curr <= prev) {
            thread->shared->SetVerificationFailure();
            fprintf(stderr,
                    "TestIterateAgainstExpected failed: found unexpectedly "
                    "large key\n");
            fprintf(stderr, "Column family: %s, op_logs: %s\n",
                    cfh->GetName().c_str(), op_logs.c_str());
            fprintf(stderr, "Last op found key: %s, expected at most: %s\n",
                    Slice(Key(prev)).ToString(true).c_str(),
                    Slice(Key(curr - 1)).ToString(true).c_str());
            thread->stats.AddErrors(1);
            return Status::OK();
          }
          if (!check_no_key_in_range(static_cast<int64_t>(prev + 1),
                                     static_cast<int64_t>(curr))) {
            return Status::OK();
          }
        }
      }
    }

    if (!iter->status().ok()) {
      if (IsErrorInjectedAndRetryable(iter->status())) {
        return iter->status();
      } else {
        thread->shared->SetVerificationFailure();
        fprintf(stderr, "TestIterate against expected state error: %s\n",
                iter->status().ToString().c_str());
        fprintf(stderr, "Column family: %s, op_logs: %s\n",
                cfh->GetName().c_str(), op_logs.c_str());
        thread->stats.AddErrors(1);
        return iter->status();
      }
    }

    thread->stats.AddIterations(1);

    return Status::OK();
  }

  bool VerifyOrSyncValue(int cf, int64_t key, const ReadOptions& opts,
                         SharedState* shared, const std::string& value_from_db,
                         std::string msg_prefix, const Status& s) const {
    if (shared->HasVerificationFailedYet()) {
      return false;
    }
    const ExpectedValue expected_value = shared->Get(cf, key);

    if (expected_value.PendingWrite() || expected_value.PendingDelete()) {
      if (s.ok()) {
        // Value exists in db, update state to reflect that
        Slice slice(value_from_db);
        uint32_t value_base = GetValueBase(slice);
        shared->SyncPut(cf, key, value_base);
        return true;
      } else if (s.IsNotFound()) {
        // Value doesn't exist in db, update state to reflect that
        shared->SyncDelete(cf, key);
        return true;
      } else {
        assert(false);
      }
    }
    char expected_value_data[kValueMaxLen];
    size_t expected_value_data_size =
        GenerateValue(expected_value.GetValueBase(), expected_value_data,
                      sizeof(expected_value_data));

    std::ostringstream read_u64ts;
    if (opts.timestamp) {
      read_u64ts << " while read with timestamp: ";
      uint64_t read_ts;
      if (DecodeU64Ts(*opts.timestamp, &read_ts).ok()) {
        read_u64ts << std::to_string(read_ts) << ", ";
      } else {
        read_u64ts << s.ToString()
                   << " Encoded read timestamp: " << opts.timestamp->ToString()
                   << ", ";
      }
    }

    // compare value_from_db with the value in the shared state
    if (s.ok()) {
      const Slice slice(value_from_db);
      const uint32_t value_base_from_db = GetValueBase(slice);
      if (ExpectedValueHelper::MustHaveNotExisted(expected_value,
                                                  expected_value)) {
        VerificationAbort(
            shared, msg_prefix + ": Unexpected value found" + read_u64ts.str(),
            cf, key, value_from_db, "");
        return false;
      }
      if (!ExpectedValueHelper::InExpectedValueBaseRange(
              value_base_from_db, expected_value, expected_value)) {
        VerificationAbort(
            shared, msg_prefix + ": Unexpected value found" + read_u64ts.str(),
            cf, key, value_from_db,
            Slice(expected_value_data, expected_value_data_size));
        return false;
      }
      // TODO: are the length/memcmp() checks repetitive?
      if (value_from_db.length() != expected_value_data_size) {
        VerificationAbort(shared,
                          msg_prefix + ": Length of value read is not equal" +
                              read_u64ts.str(),
                          cf, key, value_from_db,
                          Slice(expected_value_data, expected_value_data_size));
        return false;
      }
      if (memcmp(value_from_db.data(), expected_value_data,
                 expected_value_data_size) != 0) {
        VerificationAbort(shared,
                          msg_prefix + ": Contents of value read don't match" +
                              read_u64ts.str(),
                          cf, key, value_from_db,
                          Slice(expected_value_data, expected_value_data_size));
        return false;
      }
    } else if (s.IsNotFound()) {
      if (ExpectedValueHelper::MustHaveExisted(expected_value,
                                               expected_value)) {
        VerificationAbort(
            shared,
            msg_prefix + ": Value not found " + read_u64ts.str() + s.ToString(),
            cf, key, "", Slice(expected_value_data, expected_value_data_size));
        return false;
      }
    } else {
      VerificationAbort(
          shared,
          msg_prefix + "Non-OK status " + read_u64ts.str() + s.ToString(), cf,
          key, "", Slice(expected_value_data, expected_value_data_size));
      return false;
    }
    return true;
  }

  void PrepareTxnDbOptions(SharedState* shared,
                           TransactionDBOptions& txn_db_opts) override {
    txn_db_opts.rollback_deletion_type_callback =
        [shared](TransactionDB*, ColumnFamilyHandle*, const Slice& key) {
          assert(shared);
          uint64_t key_num = 0;
          bool ok = GetIntVal(key.ToString(), &key_num);
          assert(ok);
          (void)ok;
          return !shared->AllowsOverwrite(key_num);
        };
  }

  void MaybeAddKeyToTxnForRYW(
      ThreadState* thread, int column_family, int64_t key, Transaction* txn,
      std::unordered_map<std::string, ExpectedValue>& ryw_expected_values) {
    assert(thread);
    assert(txn);

    SharedState* const shared = thread->shared;
    assert(shared);

    const ExpectedValue expected_value =
        thread->shared->Get(column_family, key);
    bool may_exist = !ExpectedValueHelper::MustHaveNotExisted(expected_value,
                                                              expected_value);
    if (!shared->AllowsOverwrite(key) && may_exist) {
      // Just do read your write checks for keys that allow overwrites.
      return;
    }

    // With a 1 in 10 probability, insert the just added key in the batch
    // into the transaction. This will create an overlap with the MultiGet
    // keys and exercise some corner cases in the code
    if (thread->rand.OneIn(10)) {
      assert(column_family >= 0);
      assert(column_family < static_cast<int>(column_families_.size()));

      ColumnFamilyHandle* const cfh = column_families_[column_family];
      assert(cfh);

      const std::string k = Key(key);

      enum class Op {
        PutOrPutEntity,
        Merge,
        Delete,
        // add new operations above this line
        NumberOfOps
      };

      const Op op = static_cast<Op>(
          thread->rand.Uniform(static_cast<int>(Op::NumberOfOps)));

      Status s;

      switch (op) {
        case Op::PutOrPutEntity:
        case Op::Merge: {
          ExpectedValue put_value;
          put_value.SyncPut(static_cast<uint32_t>(thread->rand.Uniform(
              static_cast<int>(ExpectedValue::GetValueBaseMask()))));
          ryw_expected_values[k] = put_value;

          const uint32_t value_base = put_value.GetValueBase();

          char value[100];
          const size_t sz = GenerateValue(value_base, value, sizeof(value));
          const Slice v(value, sz);

          if (op == Op::PutOrPutEntity) {
            if (FLAGS_use_put_entity_one_in > 0 &&
                (value_base % FLAGS_use_put_entity_one_in) == 0) {
              s = txn->PutEntity(cfh, k, GenerateWideColumns(value_base, v));
            } else {
              s = txn->Put(cfh, k, v);
            }
          } else {
            s = txn->Merge(cfh, k, v);
          }

          break;
        }
        case Op::Delete: {
          ExpectedValue delete_value;
          delete_value.SyncDelete();
          ryw_expected_values[k] = delete_value;

          s = txn->Delete(cfh, k);
          break;
        }
        default:
          assert(false);
      }

      if (!s.ok()) {
        fprintf(stderr,
                "Transaction write error in read-your-own-write test: %s\n",
                s.ToString().c_str());
        shared->SafeTerminate();
      }
    }
  }
};

StressTest* CreateNonBatchedOpsStressTest() {
  return new NonBatchedOpsStressTest();
}

}  // namespace ROCKSDB_NAMESPACE
#endif  // GFLAGS