rocksdb/db/repair.cc
Andrew Kryczka 343507afb1 Refactor to use VersionSet [CF + RepairDB part 1/3]
Summary:
To support column families, it is easiest to use VersionSet to manage
our column families (if we don't have Versions then ColumnFamilyData always
behaves as a dummy column family). This diff only refactors the existing repair
logic to use VersionSet; the next two parts will add support for multiple
column families.

Test Plan:
  $ ./repair_test

Reviewers: yhchiang, IslamAbdelRahman, sdong

Reviewed By: sdong

Subscribers: andrewkr, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D59775
2016-06-24 11:19:40 -07:00

490 lines
16 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Repairer does best effort recovery to recover as much data as possible after
// a disaster without compromising consistency. It does not guarantee bringing
// the database to a time consistent state.
//
// Repair process is broken into 4 phases:
// (a) Find files
// (b) Convert logs to tables
// (c) Extract metadata
// (d) Write Descriptor
//
// (a) Find files
//
// The repairer goes through all the files in the directory, and classifies them
// based on their file name. Any file that cannot be identified by name will be
// ignored.
//
// (b) Convert logs to table
//
// Every log file that is active is replayed. All sections of the file where the
// checksum does not match is skipped over. We intentionally give preference to
// data consistency.
//
// (c) Extract metadata
//
// We scan every table to compute
// (1) smallest/largest for the table
// (2) largest sequence number in the table
//
// If we are unable to scan the file, then we ignore the table.
//
// (d) Write Descriptor
//
// We generate descriptor contents:
// - log number is set to zero
// - next-file-number is set to 1 + largest file number we found
// - last-sequence-number is set to largest sequence# found across
// all tables (see 2c)
// - compaction pointers are cleared
// - every table file is added at level 0
//
// Possible optimization 1:
// (a) Compute total size and use to pick appropriate max-level M
// (b) Sort tables by largest sequence# in the table
// (c) For each table: if it overlaps earlier table, place in level-0,
// else place in level-M.
// (d) We can provide options for time consistent recovery and unsafe recovery
// (ignore checksum failure when applicable)
// Possible optimization 2:
// Store per-table metadata (smallest, largest, largest-seq#, ...)
// in the table's meta section to speed up ScanTable.
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include "db/builder.h"
#include "db/db_impl.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "db/log_reader.h"
#include "db/log_writer.h"
#include "db/memtable.h"
#include "db/table_cache.h"
#include "db/version_edit.h"
#include "db/writebuffer.h"
#include "db/write_batch_internal.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/immutable_options.h"
#include "table/scoped_arena_iterator.h"
#include "util/file_reader_writer.h"
namespace rocksdb {
namespace {
class Repairer {
public:
Repairer(const std::string& dbname, const Options& options)
: dbname_(dbname),
env_(options.env),
icmp_(options.comparator),
cf_options_(options),
options_(SanitizeOptions(dbname, &icmp_, options)),
ioptions_(options_),
env_options_(),
raw_table_cache_(
// TableCache can be small since we expect each table to be opened
// once.
NewLRUCache(10, options_.table_cache_numshardbits)),
table_cache_(
new TableCache(ioptions_, env_options_, raw_table_cache_.get())),
wb_(options_.db_write_buffer_size),
wc_(options_.delayed_write_rate),
vset_(dbname_, &options_, env_options_, raw_table_cache_.get(), &wb_,
&wc_),
next_file_number_(1) {
GetIntTblPropCollectorFactory(options, &int_tbl_prop_collector_factories_);
}
// Adds a column family to the VersionSet with cf_options_ and updates
// manifest.
Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
MutableCFOptions mut_cf_opts(options_, ioptions_);
VersionEdit edit;
edit.SetComparatorName(icmp_.user_comparator()->Name());
edit.SetLogNumber(0);
edit.SetColumnFamily(cf_id);
ColumnFamilyData* cfd;
cfd = nullptr;
edit.AddColumnFamily(cf_name);
mutex_.Lock();
Status status = vset_.LogAndApply(
cfd, mut_cf_opts, &edit, &mutex_, nullptr /* db_directory */,
false /* new_descriptor_log */, &cf_options_);
mutex_.Unlock();
return status;
}
~Repairer() {
delete table_cache_;
}
Status Run() {
Status status = FindFiles();
if (status.ok()) {
// Discard older manifests and start a fresh one
for (size_t i = 0; i < manifests_.size(); i++) {
ArchiveFile(dbname_ + "/" + manifests_[i]);
}
// Just create a DBImpl temporarily so we can reuse NewDB()
DBImpl* db_impl = new DBImpl(options_, dbname_);
status = db_impl->NewDB();
delete db_impl;
}
if (status.ok()) {
// Recover using the fresh manifest created by NewDB()
status = vset_.Recover({{kDefaultColumnFamilyName, cf_options_}}, false);
}
if (status.ok()) {
ConvertLogFilesToTables();
ExtractMetaData();
status = AddTables();
}
if (status.ok()) {
uint64_t bytes = 0;
for (size_t i = 0; i < tables_.size(); i++) {
bytes += tables_[i].meta.fd.GetFileSize();
}
Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
"**** Repaired rocksdb %s; "
"recovered %" ROCKSDB_PRIszt " files; %" PRIu64
"bytes. "
"Some data may have been lost. "
"****",
dbname_.c_str(), tables_.size(), bytes);
}
return status;
}
private:
struct TableInfo {
FileMetaData meta;
SequenceNumber min_sequence;
SequenceNumber max_sequence;
};
std::string const dbname_;
Env* const env_;
const InternalKeyComparator icmp_;
std::vector<std::unique_ptr<IntTblPropCollectorFactory>>
int_tbl_prop_collector_factories_;
const ColumnFamilyOptions cf_options_; // unsanitized
const Options options_; // sanitized
const ImmutableCFOptions ioptions_; // sanitized
const EnvOptions env_options_;
std::shared_ptr<Cache> raw_table_cache_;
TableCache* table_cache_;
WriteBuffer wb_;
WriteController wc_;
VersionSet vset_;
InstrumentedMutex mutex_;
std::vector<std::string> manifests_;
std::vector<FileDescriptor> table_fds_;
std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_;
uint64_t next_file_number_;
Status FindFiles() {
std::vector<std::string> filenames;
bool found_file = false;
for (size_t path_id = 0; path_id < options_.db_paths.size(); path_id++) {
Status status =
env_->GetChildren(options_.db_paths[path_id].path, &filenames);
if (!status.ok()) {
return status;
}
if (!filenames.empty()) {
found_file = true;
}
uint64_t number;
FileType type;
for (size_t i = 0; i < filenames.size(); i++) {
if (ParseFileName(filenames[i], &number, &type)) {
if (type == kDescriptorFile) {
assert(path_id == 0);
manifests_.push_back(filenames[i]);
} else {
if (number + 1 > next_file_number_) {
next_file_number_ = number + 1;
}
if (type == kLogFile) {
assert(path_id == 0);
logs_.push_back(number);
} else if (type == kTableFile) {
table_fds_.emplace_back(number, static_cast<uint32_t>(path_id),
0);
} else {
// Ignore other files
}
}
}
}
}
if (!found_file) {
return Status::Corruption(dbname_, "repair found no files");
}
return Status::OK();
}
void ConvertLogFilesToTables() {
for (size_t i = 0; i < logs_.size(); i++) {
std::string logname = LogFileName(dbname_, logs_[i]);
Status status = ConvertLogToTable(logs_[i]);
if (!status.ok()) {
Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
"Log #%" PRIu64 ": ignoring conversion error: %s", logs_[i],
status.ToString().c_str());
}
ArchiveFile(logname);
}
}
Status ConvertLogToTable(uint64_t log) {
struct LogReporter : public log::Reader::Reporter {
Env* env;
std::shared_ptr<Logger> info_log;
uint64_t lognum;
virtual void Corruption(size_t bytes, const Status& s) override {
// We print error messages for corruption, but continue repairing.
Log(InfoLogLevel::ERROR_LEVEL, info_log,
"Log #%" PRIu64 ": dropping %d bytes; %s", lognum,
static_cast<int>(bytes), s.ToString().c_str());
}
};
// Open the log file
std::string logname = LogFileName(dbname_, log);
unique_ptr<SequentialFile> lfile;
Status status = env_->NewSequentialFile(logname, &lfile, env_options_);
if (!status.ok()) {
return status;
}
unique_ptr<SequentialFileReader> lfile_reader(
new SequentialFileReader(std::move(lfile)));
// Create the log reader.
LogReporter reporter;
reporter.env = env_;
reporter.info_log = options_.info_log;
reporter.lognum = log;
// We intentionally make log::Reader do checksumming so that
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
log::Reader reader(options_.info_log, std::move(lfile_reader), &reporter,
true /*enable checksum*/, 0 /*initial_offset*/, log);
// Read all the records and add to a memtable
std::string scratch;
Slice record;
WriteBatch batch;
WriteBuffer wb(options_.db_write_buffer_size);
MemTable* mem =
new MemTable(icmp_, ioptions_, MutableCFOptions(options_, ioptions_),
&wb, kMaxSequenceNumber);
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem);
mem->Ref();
int counter = 0;
while (reader.ReadRecord(&record, &scratch)) {
if (record.size() < WriteBatchInternal::kHeader) {
reporter.Corruption(
record.size(), Status::Corruption("log record too small"));
continue;
}
WriteBatchInternal::SetContents(&batch, record);
status = WriteBatchInternal::InsertInto(&batch, cf_mems_default, nullptr);
if (status.ok()) {
counter += WriteBatchInternal::Count(&batch);
} else {
Log(InfoLogLevel::WARN_LEVEL,
options_.info_log, "Log #%" PRIu64 ": ignoring %s", log,
status.ToString().c_str());
status = Status::OK(); // Keep going with rest of file
}
}
// Do not record a version edit for this conversion to a Table
// since ExtractMetaData() will also generate edits.
FileMetaData meta;
meta.fd = FileDescriptor(next_file_number_++, 0, 0);
{
ReadOptions ro;
ro.total_order_seek = true;
Arena arena;
ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
MutableCFOptions mutable_cf_options(options_, ioptions_);
status = BuildTable(
dbname_, env_, ioptions_, mutable_cf_options, env_options_,
table_cache_, iter.get(), &meta, icmp_,
&int_tbl_prop_collector_factories_,
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
std::string() /* column_family_name */, {}, kMaxSequenceNumber,
kNoCompression, CompressionOptions(), false,
nullptr /* internal_stats */, TableFileCreationReason::kRecovery);
}
delete mem->Unref();
delete cf_mems_default;
mem = nullptr;
if (status.ok()) {
if (meta.fd.GetFileSize() > 0) {
table_fds_.push_back(meta.fd);
}
}
Log(InfoLogLevel::INFO_LEVEL, options_.info_log,
"Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
log, counter, meta.fd.GetNumber(), status.ToString().c_str());
return status;
}
void ExtractMetaData() {
for (size_t i = 0; i < table_fds_.size(); i++) {
TableInfo t;
t.meta.fd = table_fds_[i];
Status status = ScanTable(&t);
if (!status.ok()) {
std::string fname = TableFileName(
options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
char file_num_buf[kFormatFileNumberBufSize];
FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
file_num_buf, sizeof(file_num_buf));
Log(InfoLogLevel::WARN_LEVEL, options_.info_log,
"Table #%s: ignoring %s", file_num_buf,
status.ToString().c_str());
ArchiveFile(fname);
} else {
tables_.push_back(t);
}
}
}
Status ScanTable(TableInfo* t) {
std::string fname = TableFileName(options_.db_paths, t->meta.fd.GetNumber(),
t->meta.fd.GetPathId());
int counter = 0;
uint64_t file_size;
Status status = env_->GetFileSize(fname, &file_size);
t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
file_size);
if (status.ok()) {
InternalIterator* iter = table_cache_->NewIterator(
ReadOptions(), env_options_, icmp_, t->meta.fd);
bool empty = true;
ParsedInternalKey parsed;
t->min_sequence = 0;
t->max_sequence = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
Slice key = iter->key();
if (!ParseInternalKey(key, &parsed)) {
Log(InfoLogLevel::ERROR_LEVEL,
options_.info_log, "Table #%" PRIu64 ": unparsable key %s",
t->meta.fd.GetNumber(), EscapeString(key).c_str());
continue;
}
counter++;
if (empty) {
empty = false;
t->meta.smallest.DecodeFrom(key);
}
t->meta.largest.DecodeFrom(key);
if (parsed.sequence < t->min_sequence) {
t->min_sequence = parsed.sequence;
}
if (parsed.sequence > t->max_sequence) {
t->max_sequence = parsed.sequence;
}
}
if (!iter->status().ok()) {
status = iter->status();
}
delete iter;
}
Log(InfoLogLevel::INFO_LEVEL,
options_.info_log, "Table #%" PRIu64 ": %d entries %s",
t->meta.fd.GetNumber(), counter, status.ToString().c_str());
return status;
}
Status AddTables() {
SequenceNumber max_sequence = 0;
for (size_t i = 0; i < tables_.size(); i++) {
if (max_sequence < tables_[i].max_sequence) {
max_sequence = tables_[i].max_sequence;
}
}
vset_.SetLastSequence(max_sequence);
auto* cfd = vset_.GetColumnFamilySet()->GetDefault();
VersionEdit edit;
edit.SetComparatorName(cfd->user_comparator()->Name());
edit.SetLogNumber(0);
edit.SetNextFile(next_file_number_);
edit.SetColumnFamily(cfd->GetID());
// TODO(opt): separate out into multiple levels
for (const auto& table : tables_) {
edit.AddFile(0, table.meta.fd.GetNumber(), table.meta.fd.GetPathId(),
table.meta.fd.GetFileSize(), table.meta.smallest,
table.meta.largest, table.min_sequence, table.max_sequence,
table.meta.marked_for_compaction);
}
mutex_.Lock();
Status status = vset_.LogAndApply(
cfd, *cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
nullptr /* db_directory */, false /* new_descriptor_log */);
mutex_.Unlock();
return status;
}
void ArchiveFile(const std::string& fname) {
// Move into another directory. E.g., for
// dir/foo
// rename to
// dir/lost/foo
const char* slash = strrchr(fname.c_str(), '/');
std::string new_dir;
if (slash != nullptr) {
new_dir.assign(fname.data(), slash - fname.data());
}
new_dir.append("/lost");
env_->CreateDir(new_dir); // Ignore error
std::string new_file = new_dir;
new_file.append("/");
new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
Status s = env_->RenameFile(fname, new_file);
Log(InfoLogLevel::INFO_LEVEL,
options_.info_log, "Archiving %s: %s\n",
fname.c_str(), s.ToString().c_str());
}
};
} // anonymous namespace
Status RepairDB(const std::string& dbname, const Options& options) {
Repairer repairer(dbname, options);
return repairer.Run();
}
} // namespace rocksdb
#endif // ROCKSDB_LITE