rocksdb/db/repair.cc

//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Repairer does best effort recovery to recover as much data as possible after
// a disaster without compromising consistency. It does not guarantee bringing
// the database to a time consistent state.
//
// Repair process is broken into 4 phases:
// (a) Find files
// (b) Convert logs to tables
// (c) Extract metadata
// (d) Write Descriptor
//
// (a) Find files
//
// The repairer goes through all the files in the directory, and classifies them
// based on their file name. Any file that cannot be identified by name will be
// ignored.
//
// (b) Convert logs to table
//
// Every log file that is active is replayed. All sections of the file where the
// checksum does not match is skipped over. We intentionally give preference to
// data consistency.
//
// (c) Extract metadata
//
// We scan every table to compute
// (1) smallest/largest for the table
// (2) largest sequence number in the table
// (3) oldest blob file referred to by the table (if applicable)
//
// If we are unable to scan the file, then we ignore the table.
//
// (d) Write Descriptor
//
// We generate descriptor contents:
//  - log number is set to zero
//  - next-file-number is set to 1 + largest file number we found
//  - last-sequence-number is set to largest sequence# found across
//    all tables (see 2c)
//  - compaction pointers are cleared
//  - every table file is added at level 0
//
// Possible optimization 1:
//   (a) Compute total size and use to pick appropriate max-level M
//   (b) Sort tables by largest sequence# in the table
//   (c) For each table: if it overlaps earlier table, place in level-0,
//       else place in level-M.
//   (d) We can provide options for time consistent recovery and unsafe recovery
//       (ignore checksum failure when applicable)
// Possible optimization 2:
//   Store per-table metadata (smallest, largest, largest-seq#, ...)
//   in the table's meta section to speed up ScanTable.

#include "db/version_builder.h"

#include <cinttypes>

#include "db/builder.h"
#include "db/db_impl/db_impl.h"
#include "db/dbformat.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/write_batch_internal.h"
#include "file/filename.h"
#include "file/writable_file_writer.h"
#include "logging/logging.h"
#include "options/cf_options.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/write_buffer_manager.h"
#include "table/scoped_arena_iterator.h"
#include "table/unique_id_impl.h"
#include "util/string_util.h"

namespace ROCKSDB_NAMESPACE {

namespace {

class Repairer {
 public:
  Repairer(const std::string& dbname, const DBOptions& db_options,
           const std::vector<ColumnFamilyDescriptor>& column_families,
           const ColumnFamilyOptions& default_cf_opts,
           const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
      : dbname_(dbname),
        db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)),
        env_(db_options.env),
        file_options_(),
        db_options_(SanitizeOptions(dbname_, db_options)),
        immutable_db_options_(ImmutableDBOptions(db_options_)),
        icmp_(default_cf_opts.comparator),
        default_cf_opts_(
            SanitizeOptions(immutable_db_options_, default_cf_opts)),
        default_iopts_(
            ImmutableOptions(immutable_db_options_, default_cf_opts_)),
        unknown_cf_opts_(
            SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
        create_unknown_cfs_(create_unknown_cfs),
        raw_table_cache_(
            // TableCache can be small since we expect each table to be opened
            // once.
            NewLRUCache(10, db_options_.table_cache_numshardbits)),
        table_cache_(new TableCache(default_iopts_, &file_options_,
                                    raw_table_cache_.get(),
                                    /*block_cache_tracer=*/nullptr,
                                    /*io_tracer=*/nullptr, db_session_id_)),
        wb_(db_options_.db_write_buffer_size),
        wc_(db_options_.delayed_write_rate),
        vset_(dbname_, &immutable_db_options_, file_options_,
              raw_table_cache_.get(), &wb_, &wc_,
              /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
              /*db_id=*/"", db_session_id_, db_options.daily_offpeak_time_utc),
        next_file_number_(1),
        db_lock_(nullptr),
        closed_(false) {
    for (const auto& cfd : column_families) {
      cf_name_to_opts_[cfd.name] = cfd.options;
    }
  }

  const ColumnFamilyOptions* GetColumnFamilyOptions(
      const std::string& cf_name) {
    if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) {
      if (create_unknown_cfs_) {
        return &unknown_cf_opts_;
      }
      return nullptr;
    }
    return &cf_name_to_opts_[cf_name];
  }

  // Adds a column family to the VersionSet with cf_options_ and updates
  // manifest.
  Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
    // TODO: plumb Env::IOActivity;
    const ReadOptions read_options;
    const auto* cf_opts = GetColumnFamilyOptions(cf_name);
    if (cf_opts == nullptr) {
      return Status::Corruption("Encountered unknown column family with name=" +
                                cf_name + ", id=" + std::to_string(cf_id));
    }
    Options opts(db_options_, *cf_opts);
    MutableCFOptions mut_cf_opts(opts);

    VersionEdit edit;
    edit.SetComparatorName(opts.comparator->Name());
    edit.SetPersistUserDefinedTimestamps(opts.persist_user_defined_timestamps);
    edit.SetLogNumber(0);
    edit.SetColumnFamily(cf_id);
    ColumnFamilyData* cfd;
    cfd = nullptr;
    edit.AddColumnFamily(cf_name);

    mutex_.Lock();
    std::unique_ptr<FSDirectory> db_dir;
    Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
                                                        &db_dir, nullptr);
    if (status.ok()) {
      status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, &edit, &mutex_,
                                 db_dir.get(), false /* new_descriptor_log */,
                                 cf_opts);
    }
    mutex_.Unlock();
    return status;
  }

  Status Close() {
    Status s = Status::OK();
    if (!closed_) {
      if (db_lock_ != nullptr) {
        s = env_->UnlockFile(db_lock_);
        db_lock_ = nullptr;
      }
      closed_ = true;
    }
    return s;
  }

  ~Repairer() { Close().PermitUncheckedError(); }

  Status Run() {
    Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
    if (!status.ok()) {
      return status;
    }
    status = FindFiles();
    DBImpl* db_impl = nullptr;
    if (status.ok()) {
      // Discard older manifests and start a fresh one
      for (size_t i = 0; i < manifests_.size(); i++) {
        ArchiveFile(dbname_ + "/" + manifests_[i]);
      }
      // Just create a DBImpl temporarily so we can reuse NewDB()
      db_impl = new DBImpl(db_options_, dbname_);
      status = db_impl->NewDB(/*new_filenames=*/nullptr);
    }
    delete db_impl;

    if (status.ok()) {
      // Recover using the fresh manifest created by NewDB()
      status =
          vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false);
    }
    if (status.ok()) {
      // Need to scan existing SST files first so the column families are
      // created before we process WAL files
      ExtractMetaData();

      // ExtractMetaData() uses table_fds_ to know which SST files' metadata to
      // extract -- we need to clear it here since metadata for existing SST
      // files has been extracted already
      table_fds_.clear();
      ConvertLogFilesToTables();
      ExtractMetaData();
      status = AddTables();
    }
    if (status.ok()) {
      uint64_t bytes = 0;
      for (size_t i = 0; i < tables_.size(); i++) {
        bytes += tables_[i].meta.fd.GetFileSize();
      }
      ROCKS_LOG_WARN(db_options_.info_log,
                     "**** Repaired rocksdb %s; "
                     "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
                     " bytes. "
                     "Some data may have been lost. "
                     "****",
                     dbname_.c_str(), tables_.size(), bytes);
    }
    return status;
  }

 private:
  struct TableInfo {
    FileMetaData meta;
    uint32_t column_family_id;
    std::string column_family_name;
  };

  std::string const dbname_;
  std::string db_session_id_;
  Env* const env_;
  const FileOptions file_options_;
  const DBOptions db_options_;
  const ImmutableDBOptions immutable_db_options_;
  const InternalKeyComparator icmp_;
  const ColumnFamilyOptions default_cf_opts_;
  const ImmutableOptions default_iopts_;  // table_cache_ holds reference
  const ColumnFamilyOptions unknown_cf_opts_;
  const bool create_unknown_cfs_;
  std::shared_ptr<Cache> raw_table_cache_;
  std::unique_ptr<TableCache> table_cache_;
  WriteBufferManager wb_;
  WriteController wc_;
  VersionSet vset_;
  std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_opts_;
  InstrumentedMutex mutex_;

  std::vector<std::string> manifests_;
  std::vector<FileDescriptor> table_fds_;
  std::vector<uint64_t> logs_;
  std::vector<TableInfo> tables_;
  uint64_t next_file_number_;
  // Lock over the persistent DB state. Non-nullptr iff successfully
  // acquired.
  FileLock* db_lock_;
  bool closed_;

  Status FindFiles() {
    std::vector<std::string> filenames;
    bool found_file = false;
    std::vector<std::string> to_search_paths;

    for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) {
      to_search_paths.push_back(db_options_.db_paths[path_id].path);
    }

    // search wal_dir if user uses a customize wal_dir
    bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_);
    if (!same) {
      to_search_paths.push_back(immutable_db_options_.wal_dir);
    }

    for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) {
      ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n",
                     to_search_paths[path_id].c_str());
      Status status = env_->GetChildren(to_search_paths[path_id], &filenames);
      if (!status.ok()) {
        return status;
      }
      if (!filenames.empty()) {
        found_file = true;
      }

      uint64_t number;
      FileType type;
      for (size_t i = 0; i < filenames.size(); i++) {
        if (ParseFileName(filenames[i], &number, &type)) {
          if (type == kDescriptorFile) {
            manifests_.push_back(filenames[i]);
          } else {
            if (number + 1 > next_file_number_) {
              next_file_number_ = number + 1;
            }
            if (type == kWalFile) {
              logs_.push_back(number);
            } else if (type == kTableFile) {
              table_fds_.emplace_back(number, static_cast<uint32_t>(path_id),
                                      0);
            } else {
              // Ignore other files
            }
          }
        }
      }
    }
    if (!found_file) {
      return Status::Corruption(dbname_, "repair found no files");
    }
    return Status::OK();
  }

  void ConvertLogFilesToTables() {
    const auto& wal_dir = immutable_db_options_.GetWalDir();
    for (size_t i = 0; i < logs_.size(); i++) {
      // we should use LogFileName(wal_dir, logs_[i]) here. user might uses
      // wal_dir option.
      std::string logname = LogFileName(wal_dir, logs_[i]);
      Status status = ConvertLogToTable(wal_dir, logs_[i]);
      if (!status.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Log #%" PRIu64 ": ignoring conversion error: %s",
                       logs_[i], status.ToString().c_str());
      }
      ArchiveFile(logname);
    }
  }

  Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) {
    struct LogReporter : public log::Reader::Reporter {
      Env* env;
      std::shared_ptr<Logger> info_log;
      uint64_t lognum;
      void Corruption(size_t bytes, const Status& s) override {
        // We print error messages for corruption, but continue repairing.
        ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s",
                        lognum, static_cast<int>(bytes), s.ToString().c_str());
      }
    };

    // TODO: plumb Env::IOActivity
    const ReadOptions read_options;

    // Open the log file
    std::string logname = LogFileName(wal_dir, log);
    const auto& fs = env_->GetFileSystem();
    std::unique_ptr<SequentialFileReader> lfile_reader;
    Status status = SequentialFileReader::Create(
        fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader,
        nullptr /* dbg */, nullptr /* rate limiter */);
    if (!status.ok()) {
      return status;
    }

    // Create the log reader.
    LogReporter reporter;
    reporter.env = env_;
    reporter.info_log = db_options_.info_log;
    reporter.lognum = log;
    // We intentionally make log::Reader do checksumming so that
    // corruptions cause entire commits to be skipped instead of
    // propagating bad information (like overly large sequence
    // numbers).
    log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter,
                       true /*enable checksum*/, log);

    // Initialize per-column family memtables
    for (auto* cfd : *vset_.GetColumnFamilySet()) {
      cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
                             kMaxSequenceNumber);
    }
    auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());

    // Read all the records and add to a memtable
    const UnorderedMap<uint32_t, size_t>& running_ts_sz =
        vset_.GetRunningColumnFamiliesTimestampSize();
    std::string scratch;
    Slice record;
    WriteBatch batch;

    int counter = 0;
    while (reader.ReadRecord(&record, &scratch)) {
      if (record.size() < WriteBatchInternal::kHeader) {
        reporter.Corruption(record.size(),
                            Status::Corruption("log record too small"));
        continue;
      }
      Status record_status = WriteBatchInternal::SetContents(&batch, record);
      if (record_status.ok()) {
        const UnorderedMap<uint32_t, size_t>& record_ts_sz =
            reader.GetRecordedTimestampSize();
        record_status = HandleWriteBatchTimestampSizeDifference(
            &batch, running_ts_sz, record_ts_sz,
            TimestampSizeConsistencyMode::kVerifyConsistency);
        if (record_status.ok()) {
          record_status =
              WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr);
        }
      }
      if (record_status.ok()) {
        counter += WriteBatchInternal::Count(&batch);
      } else {
        ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s",
                       log, record_status.ToString().c_str());
      }
    }

    // Dump a table for each column family with entries in this log file.
    for (auto* cfd : *vset_.GetColumnFamilySet()) {
      // Do not record a version edit for this conversion to a Table
      // since ExtractMetaData() will also generate edits.
      MemTable* mem = cfd->mem();
      if (mem->IsEmpty()) {
        continue;
      }

      FileMetaData meta;
      meta.fd = FileDescriptor(next_file_number_++, 0, 0);
      // TODO: plumb Env::IOActivity
      ReadOptions ro;
      ro.total_order_seek = true;
      Arena arena;
      ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
      int64_t _current_time = 0;
      immutable_db_options_.clock->GetCurrentTime(&_current_time)
          .PermitUncheckedError();  // ignore error
      const uint64_t current_time = static_cast<uint64_t>(_current_time);
      meta.file_creation_time = current_time;
      SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();

      auto write_hint = cfd->CalculateSSTWriteHint(0);
      std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
          range_del_iters;
      auto range_del_iter = mem->NewRangeTombstoneIterator(
          ro, kMaxSequenceNumber, false /* immutable_memtable */);
      if (range_del_iter != nullptr) {
        range_del_iters.emplace_back(range_del_iter);
      }

      IOStatus io_s;
      CompressionOptions default_compression;
      TableBuilderOptions tboptions(
          *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
          cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
          kNoCompression, default_compression, cfd->GetID(), cfd->GetName(),
          -1 /* level */, false /* is_bottommost */,
          TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
          0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_,
          0 /*target_file_size*/, meta.fd.GetNumber());

      SeqnoToTimeMapping empty_seqno_to_time_mapping;
      status = BuildTable(
          dbname_, /* versions */ nullptr, immutable_db_options_, tboptions,
          file_options_, read_options, table_cache_.get(), iter.get(),
          std::move(range_del_iters), &meta, nullptr /* blob_file_additions */,
          {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker,
          false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s,
          nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery,
          empty_seqno_to_time_mapping, nullptr /* event_logger */,
          0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */,
          write_hint);
      ROCKS_LOG_INFO(db_options_.info_log,
                     "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
                     log, counter, meta.fd.GetNumber(),
                     status.ToString().c_str());
      if (status.ok()) {
        if (meta.fd.GetFileSize() > 0) {
          table_fds_.push_back(meta.fd);
        }
      } else {
        break;
      }
    }
    delete cf_mems;
    return status;
  }

  void ExtractMetaData() {
    for (size_t i = 0; i < table_fds_.size(); i++) {
      TableInfo t;
      t.meta.fd = table_fds_[i];
      Status status = ScanTable(&t);
      if (!status.ok()) {
        std::string fname = TableFileName(
            db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
        char file_num_buf[kFormatFileNumberBufSize];
        FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
                         file_num_buf, sizeof(file_num_buf));
        ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s",
                       file_num_buf, status.ToString().c_str());
        ArchiveFile(fname);
      } else {
        tables_.push_back(t);
      }
    }
  }

  Status ScanTable(TableInfo* t) {
    std::string fname = TableFileName(
        db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId());
    int counter = 0;
    uint64_t file_size;
    Status status = env_->GetFileSize(fname, &file_size);
    t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
                                file_size);
    std::shared_ptr<const TableProperties> props;
    if (status.ok()) {
      // TODO: plumb Env::IOActivity
      const ReadOptions read_options;
      status = table_cache_->GetTableProperties(
          file_options_, read_options, icmp_, t->meta, &props,
          0 /* block_protection_bytes_per_key */);
    }
    if (status.ok()) {
      auto s =
          GetSstInternalUniqueId(props->db_id, props->db_session_id,
                                 props->orig_file_number, &t->meta.unique_id);
      if (!s.ok()) {
        ROCKS_LOG_WARN(db_options_.info_log,
                       "Table #%" PRIu64
                       ": unable to get unique id, default to Unknown.",
                       t->meta.fd.GetNumber());
      }
      t->column_family_id = static_cast<uint32_t>(props->column_family_id);
      if (t->column_family_id ==
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
        ROCKS_LOG_WARN(
            db_options_.info_log,
            "Table #%" PRIu64
            ": column family unknown (probably due to legacy format); "
            "adding to default column family id 0.",
            t->meta.fd.GetNumber());
        t->column_family_id = 0;
      }

      if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
          nullptr) {
        status =
            AddColumnFamily(props->column_family_name, t->column_family_id);
      }
      t->meta.oldest_ancester_time = props->creation_time;
      t->meta.user_defined_timestamps_persisted =
          static_cast<bool>(props->user_defined_timestamps_persisted);
    }
    if (status.ok()) {
      uint64_t tail_size = 0;
      bool contain_no_data_blocks =
          props->num_entries > 0 &&
          (props->num_entries == props->num_range_deletions);
      if (props->tail_start_offset > 0 || contain_no_data_blocks) {
        assert(props->tail_start_offset <= file_size);
        tail_size = file_size - props->tail_start_offset;
      }
      t->meta.tail_size = tail_size;
    }
    ColumnFamilyData* cfd = nullptr;
    if (status.ok()) {
      cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
      if (cfd->GetName() != props->column_family_name) {
        ROCKS_LOG_ERROR(
            db_options_.info_log,
            "Table #%" PRIu64
            ": inconsistent column family name '%s'; expected '%s' for column "
            "family id %" PRIu32 ".",
            t->meta.fd.GetNumber(), props->column_family_name.c_str(),
            cfd->GetName().c_str(), t->column_family_id);
        status = Status::Corruption(dbname_, "inconsistent column family name");
      }
    }
    if (status.ok()) {
      // TODO: plumb Env::IOActivity
      ReadOptions ropts;
      ropts.total_order_seek = true;
      InternalIterator* iter = table_cache_->NewIterator(
          ropts, file_options_, cfd->internal_comparator(), t->meta,
          nullptr /* range_del_agg */,
          cfd->GetLatestMutableCFOptions()->prefix_extractor,
          /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
          TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
          /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0,
          /*smallest_compaction_key=*/nullptr,
          /*largest_compaction_key=*/nullptr,
          /*allow_unprepared_value=*/false,
          cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key);
      ParsedInternalKey parsed;
      for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
        Slice key = iter->key();
        Status pik_status =
            ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors);
        if (!pik_status.ok()) {
          ROCKS_LOG_ERROR(db_options_.info_log,
                          "Table #%" PRIu64 ": unparsable key - %s",
                          t->meta.fd.GetNumber(), pik_status.getState());
          continue;
        }

        counter++;

        status = t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
                                          parsed.type);
        if (!status.ok()) {
          break;
        }
      }
      if (status.ok() && !iter->status().ok()) {
        status = iter->status();
      }
      delete iter;

      ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
                     t->meta.fd.GetNumber(), counter,
                     status.ToString().c_str());
    }
    if (status.ok()) {
      // XXX/FIXME: This is just basic, naive handling of range tombstones,
      // like call to UpdateBoundariesForRange in builder.cc where we assume
      // an SST file is a full sorted run. This probably needs the extra logic
      // from compaction_job.cc around call to UpdateBoundariesForRange (to
      // handle range tombstones extendingg beyond range of other entries).
      // TODO: plumb Env::IOActivity
      ReadOptions ropts;
      std::unique_ptr<FragmentedRangeTombstoneIterator> r_iter;
      status = table_cache_->GetRangeTombstoneIterator(
          ropts, cfd->internal_comparator(), t->meta,
          cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key,
          &r_iter);

      if (r_iter) {
        r_iter->SeekToFirst();

        while (r_iter->Valid()) {
          auto tombstone = r_iter->Tombstone();
          auto kv = tombstone.Serialize();
          t->meta.UpdateBoundariesForRange(
              kv.first, tombstone.SerializeEndKey(), tombstone.seq_,
              cfd->internal_comparator());
          r_iter->Next();
        }
      }
    }
    return status;
  }

  Status AddTables() {
    // TODO: plumb Env::IOActivity;
    const ReadOptions read_options;
    std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
    SequenceNumber max_sequence = 0;
    for (size_t i = 0; i < tables_.size(); i++) {
      cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
      if (max_sequence < tables_[i].meta.fd.largest_seqno) {
        max_sequence = tables_[i].meta.fd.largest_seqno;
      }
    }
    vset_.SetLastAllocatedSequence(max_sequence);
    vset_.SetLastPublishedSequence(max_sequence);
    vset_.SetLastSequence(max_sequence);

    for (const auto& cf_id_and_tables : cf_id_to_tables) {
      auto* cfd =
          vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);

      // Recover files' epoch number using dummy VersionStorageInfo
      VersionBuilder dummy_version_builder(
          cfd->current()->version_set()->file_options(), cfd->ioptions(),
          cfd->table_cache(), cfd->current()->storage_info(),
          cfd->current()->version_set(),
          cfd->GetFileMetadataCacheReservationManager());
      VersionStorageInfo dummy_vstorage(
          &cfd->internal_comparator(), cfd->user_comparator(),
          cfd->NumberLevels(), cfd->ioptions()->compaction_style,
          nullptr /* src_vstorage */, cfd->ioptions()->force_consistency_checks,
          EpochNumberRequirement::kMightMissing, cfd->ioptions()->clock,
          /*bottommost_file_compaction_delay=*/0,
          cfd->current()->version_set()->offpeak_time_option());
      Status s;
      VersionEdit dummy_edit;
      for (const auto* table : cf_id_and_tables.second) {
        // TODO(opt): separate out into multiple levels
        dummy_edit.AddFile(
            0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
            table->meta.fd.GetFileSize(), table->meta.smallest,
            table->meta.largest, table->meta.fd.smallest_seqno,
            table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
            table->meta.temperature, table->meta.oldest_blob_file_number,
            table->meta.oldest_ancester_time, table->meta.file_creation_time,
            table->meta.epoch_number, table->meta.file_checksum,
            table->meta.file_checksum_func_name, table->meta.unique_id,
            table->meta.compensated_range_deletion_size, table->meta.tail_size,
            table->meta.user_defined_timestamps_persisted);
      }
      s = dummy_version_builder.Apply(&dummy_edit);
      if (s.ok()) {
        s = dummy_version_builder.SaveTo(&dummy_vstorage);
      }
      if (s.ok()) {
        dummy_vstorage.RecoverEpochNumbers(cfd);
      }
      if (s.ok()) {
        // Record changes from this repair in VersionEdit, including files with
        // recovered epoch numbers
        VersionEdit edit;
        edit.SetComparatorName(cfd->user_comparator()->Name());
        edit.SetPersistUserDefinedTimestamps(
            cfd->ioptions()->persist_user_defined_timestamps);
        edit.SetLogNumber(0);
        edit.SetNextFile(next_file_number_);
        edit.SetColumnFamily(cfd->GetID());
        for (int level = 0; level < dummy_vstorage.num_levels(); ++level) {
          for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
            edit.AddFile(level, *file_meta);
          }
        }

        // Release resources occupied by the dummy VersionStorageInfo
        for (int level = 0; level < dummy_vstorage.num_levels(); ++level) {
          for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) {
            file_meta->refs--;
            if (file_meta->refs <= 0) {
              delete file_meta;
            }
          }
        }

        // Persist record of changes
        assert(next_file_number_ > 0);
        vset_.MarkFileNumberUsed(next_file_number_ - 1);
        mutex_.Lock();
        std::unique_ptr<FSDirectory> db_dir;
        s = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir,
                                                nullptr);
        if (s.ok()) {
          s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
                                read_options, &edit, &mutex_, db_dir.get(),
                                false /* new_descriptor_log */);
        }
        mutex_.Unlock();
      }
      if (!s.ok()) {
        return s;
      }
    }
    return Status::OK();
  }

  void ArchiveFile(const std::string& fname) {
    // Move into another directory.  E.g., for
    //    dir/foo
    // rename to
    //    dir/lost/foo
    const char* slash = strrchr(fname.c_str(), '/');
    std::string new_dir;
    if (slash != nullptr) {
      new_dir.assign(fname.data(), slash - fname.data());
    }
    new_dir.append("/lost");
    env_->CreateDir(new_dir).PermitUncheckedError();  // Ignore error
    std::string new_file = new_dir;
    new_file.append("/");
    new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
    Status s = env_->RenameFile(fname, new_file);
    ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
                   s.ToString().c_str());
  }
};

Status GetDefaultCFOptions(
    const std::vector<ColumnFamilyDescriptor>& column_families,
    ColumnFamilyOptions* res) {
  assert(res != nullptr);
  auto iter = std::find_if(column_families.begin(), column_families.end(),
                           [](const ColumnFamilyDescriptor& cfd) {
                             return cfd.name == kDefaultColumnFamilyName;
                           });
  if (iter == column_families.end()) {
    return Status::InvalidArgument(
        "column_families", "Must contain entry for default column family");
  }
  *res = iter->options;
  return Status::OK();
}
}  // anonymous namespace

Status RepairDB(const std::string& dbname, const DBOptions& db_options,
                const std::vector<ColumnFamilyDescriptor>& column_families) {
  ColumnFamilyOptions default_cf_opts;
  Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
  if (!status.ok()) {
    return status;
  }

  Repairer repairer(dbname, db_options, column_families, default_cf_opts,
                    ColumnFamilyOptions() /* unknown_cf_opts */,
                    false /* create_unknown_cfs */);
  status = repairer.Run();
  if (status.ok()) {
    status = repairer.Close();
  }
  return status;
}

Status RepairDB(const std::string& dbname, const DBOptions& db_options,
                const std::vector<ColumnFamilyDescriptor>& column_families,
                const ColumnFamilyOptions& unknown_cf_opts) {
  ColumnFamilyOptions default_cf_opts;
  Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
  if (!status.ok()) {
    return status;
  }

  Repairer repairer(dbname, db_options, column_families, default_cf_opts,
                    unknown_cf_opts, true /* create_unknown_cfs */);
  status = repairer.Run();
  if (status.ok()) {
    status = repairer.Close();
  }
  return status;
}

Status RepairDB(const std::string& dbname, const Options& options) {
  Options opts(options);
  DBOptions db_options(opts);
  ColumnFamilyOptions cf_options(opts);

  Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */,
                    cf_options /* unknown_cf_opts */,
                    true /* create_unknown_cfs */);
  Status status = repairer.Run();
  if (status.ok()) {
    status = repairer.Close();
  }
  return status;
}

}  // namespace ROCKSDB_NAMESPACE