ReadaheadRandomAccessFile -- userspace readahead

Summary:
ReadaheadRandomAccessFile acts as a transparent layer on top of RandomAccessFile. When a Read() request is issued, it issues a much bigger request to the OS and caches the result. When a new request comes in and we already have the data cached, it doesn't have to issue any requests to the OS.

We add ReadaheadRandomAccessFile layer only when file is read during compactions.

D45105 was incorrectly closed by Phabricator because I committed it to a separate branch (not master), so I'm resubmitting the diff.

Test Plan: make check

Reviewers: MarkCallaghan, sdong

Reviewed By: sdong

Subscribers: leveldb, dhruba

Differential Revision: https://reviews.facebook.net/D45123
This commit is contained in:
Igor Canadi 2015-08-26 15:25:59 -07:00
parent 16ebe3a2a9
commit 5f4166c90e
12 changed files with 143 additions and 15 deletions

View File

@ -357,6 +357,11 @@ DEFINE_int32(open_files, rocksdb::Options().max_open_files,
"Maximum number of files to keep open at the same time" "Maximum number of files to keep open at the same time"
" (use default if == 0)"); " (use default if == 0)");
DEFINE_int32(new_table_reader_for_compaction_inputs, true,
"If true, uses a separate file handle for compaction inputs");
DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means" DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
" use default settings."); " use default settings.");
DEFINE_int32(memtable_bloom_bits, 0, "Bloom filter bits per key for memtable. " DEFINE_int32(memtable_bloom_bits, 0, "Bloom filter bits per key for memtable. "
@ -2191,6 +2196,9 @@ class Benchmark {
options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits; options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits;
options.bloom_locality = FLAGS_bloom_locality; options.bloom_locality = FLAGS_bloom_locality;
options.max_open_files = FLAGS_open_files; options.max_open_files = FLAGS_open_files;
options.new_table_reader_for_compaction_inputs =
FLAGS_new_table_reader_for_compaction_inputs;
options.compaction_readahead_size = FLAGS_compaction_readahead_size;
options.statistics = dbstats; options.statistics = dbstats;
if (FLAGS_enable_io_prio) { if (FLAGS_enable_io_prio) {
FLAGS_env->LowerThreadPoolIOPriority(Env::LOW); FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);

View File

@ -156,6 +156,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max()); result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
} }
if (result.compaction_readahead_size > 0) {
result.new_table_reader_for_compaction_inputs = true;
}
return result; return result;
} }

View File

@ -85,15 +85,19 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
Status TableCache::GetTableReader( Status TableCache::GetTableReader(
const EnvOptions& env_options, const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
bool advise_random_on_open, bool record_read_stats, bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist,
HistogramImpl* file_read_hist, unique_ptr<TableReader>* table_reader) { unique_ptr<TableReader>* table_reader) {
std::string fname = std::string fname =
TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId());
unique_ptr<RandomAccessFile> file; unique_ptr<RandomAccessFile> file;
Status s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options); Status s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options);
if (sequential_mode && ioptions_.compaction_readahead_size > 0) {
file = NewReadaheadRandomAccessFile(std::move(file),
ioptions_.compaction_readahead_size);
}
RecordTick(ioptions_.statistics, NO_FILE_OPENS); RecordTick(ioptions_.statistics, NO_FILE_OPENS);
if (s.ok()) { if (s.ok()) {
if (advise_random_on_open) { if (!sequential_mode && ioptions_.advise_random_on_open) {
file->Hint(RandomAccessFile::RANDOM); file->Hint(RandomAccessFile::RANDOM);
} }
StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
@ -128,7 +132,7 @@ Status TableCache::FindTable(const EnvOptions& env_options,
} }
unique_ptr<TableReader> table_reader; unique_ptr<TableReader> table_reader;
s = GetTableReader(env_options, internal_comparator, fd, s = GetTableReader(env_options, internal_comparator, fd,
ioptions_.advise_random_on_open, record_read_stats, false /* sequential mode */, record_read_stats,
file_read_hist, &table_reader); file_read_hist, &table_reader);
if (!s.ok()) { if (!s.ok()) {
assert(table_reader == nullptr); assert(table_reader == nullptr);
@ -162,8 +166,9 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
(for_compaction && ioptions_.new_table_reader_for_compaction_inputs); (for_compaction && ioptions_.new_table_reader_for_compaction_inputs);
if (create_new_table_reader) { if (create_new_table_reader) {
unique_ptr<TableReader> table_reader_unique_ptr; unique_ptr<TableReader> table_reader_unique_ptr;
Status s = GetTableReader(env_options, icomparator, fd, false, false, Status s = GetTableReader(
nullptr, &table_reader_unique_ptr); env_options, icomparator, fd, /* sequential mode */ true,
/* record stats */ false, nullptr, &table_reader_unique_ptr);
if (!s.ok()) { if (!s.ok()) {
return NewErrorIterator(s, arena); return NewErrorIterator(s, arena);
} }

View File

@ -61,13 +61,6 @@ class TableCache {
// Evict any entry for the specified file number // Evict any entry for the specified file number
static void Evict(Cache* cache, uint64_t file_number); static void Evict(Cache* cache, uint64_t file_number);
// Build a table reader
Status GetTableReader(const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd, bool advise_random_on_open,
bool record_read_stats, HistogramImpl* file_read_hist,
unique_ptr<TableReader>* table_reader);
// Find table reader // Find table reader
Status FindTable(const EnvOptions& toptions, Status FindTable(const EnvOptions& toptions,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
@ -101,6 +94,13 @@ class TableCache {
void ReleaseHandle(Cache::Handle* handle); void ReleaseHandle(Cache::Handle* handle);
private: private:
// Build a table reader
Status GetTableReader(const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd, bool sequential_mode,
bool record_read_stats, HistogramImpl* file_read_hist,
unique_ptr<TableReader>* table_reader);
const ImmutableCFOptions& ioptions_; const ImmutableCFOptions& ioptions_;
const EnvOptions& env_options_; const EnvOptions& env_options_;
Cache* const cache_; Cache* const cache_;

View File

@ -91,6 +91,8 @@ struct ImmutableCFOptions {
bool new_table_reader_for_compaction_inputs; bool new_table_reader_for_compaction_inputs;
size_t compaction_readahead_size;
int num_levels; int num_levels;
bool optimize_filters_for_hits; bool optimize_filters_for_hits;

View File

@ -1038,6 +1038,16 @@ struct DBOptions {
// Default: false // Default: false
bool new_table_reader_for_compaction_inputs; bool new_table_reader_for_compaction_inputs;
// If non-zero, we perform bigger reads when doing compaction. If you're
// running RocksDB on spinning disks, you should set this to at least 2MB.
// That way RocksDB's compaction is doing sequential instead of random reads.
//
// When non-zero, we also force new_table_reader_for_compaction_inputs to
// true.
//
// Default: 0
size_t compaction_readahead_size;
// Use adaptive mutex, which spins in the user space before resorting // Use adaptive mutex, which spins in the user space before resorting
// to kernel. This could reduce context switch when the mutex is not // to kernel. This could reduce context switch when the mutex is not
// heavily contended. However, if the mutex is hot, we could end up // heavily contended. However, if the mutex is hot, we could end up

View File

@ -231,6 +231,7 @@ Options DBTestBase::CurrentOptions(
case kFullFilterWithNewTableReaderForCompactions: case kFullFilterWithNewTableReaderForCompactions:
table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
options.new_table_reader_for_compaction_inputs = true; options.new_table_reader_for_compaction_inputs = true;
options.compaction_readahead_size = 10 * 1024 * 1024;
break; break;
case kUncompressed: case kUncompressed:
options.compression = kNoCompression; options.compression = kNoCompression;

View File

@ -10,6 +10,8 @@
#include "util/file_reader_writer.h" #include "util/file_reader_writer.h"
#include <algorithm> #include <algorithm>
#include <mutex>
#include "port/port.h" #include "port/port.h"
#include "util/histogram.h" #include "util/histogram.h"
#include "util/iostats_context_imp.h" #include "util/iostats_context_imp.h"
@ -222,4 +224,86 @@ size_t WritableFileWriter::RequestToken(size_t bytes) {
} }
return bytes; return bytes;
} }
namespace {
class ReadaheadRandomAccessFile : public RandomAccessFile {
public:
ReadaheadRandomAccessFile(std::unique_ptr<RandomAccessFile> file,
size_t readahead_size)
: file_(std::move(file)),
readahead_size_(readahead_size),
buffer_(new char[readahead_size_]),
buffer_offset_(0),
buffer_len_(0) {}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override {
if (n >= readahead_size_) {
return file_->Read(offset, n, result, scratch);
}
std::unique_lock<std::mutex> lk(lock_);
size_t copied = 0;
// if offset between [buffer_offset_, buffer_offset_ + buffer_len>
if (offset >= buffer_offset_ && offset < buffer_len_ + buffer_offset_) {
uint64_t offset_in_buffer = offset - buffer_offset_;
copied = std::min(static_cast<uint64_t>(buffer_len_) - offset_in_buffer,
static_cast<uint64_t>(n));
memcpy(scratch, buffer_.get() + offset_in_buffer, copied);
if (copied == n) {
// fully cached
*result = Slice(scratch, n);
return Status::OK();
}
}
Slice readahead_result;
Status s = file_->Read(offset + copied, readahead_size_, &readahead_result,
buffer_.get());
if (!s.ok()) {
return s;
}
auto left_to_copy = std::min(readahead_result.size(), n - copied);
memcpy(scratch + copied, readahead_result.data(), left_to_copy);
*result = Slice(scratch, copied + left_to_copy);
if (readahead_result.data() == buffer_.get()) {
buffer_offset_ = offset + copied;
buffer_len_ = readahead_result.size();
} else {
buffer_len_ = 0;
}
return Status::OK();
}
virtual size_t GetUniqueId(char* id, size_t max_size) const override {
return file_->GetUniqueId(id, max_size);
}
virtual void Hint(AccessPattern pattern) override { file_->Hint(pattern); }
virtual Status InvalidateCache(size_t offset, size_t length) override {
return file_->InvalidateCache(offset, length);
}
private:
std::unique_ptr<RandomAccessFile> file_;
size_t readahead_size_;
mutable std::mutex lock_;
mutable std::unique_ptr<char[]> buffer_;
mutable uint64_t buffer_offset_;
mutable size_t buffer_len_;
};
} // namespace
std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
std::unique_ptr<RandomAccessFile> file, size_t readahead_size) {
std::unique_ptr<ReadaheadRandomAccessFile> wrapped_file(
new ReadaheadRandomAccessFile(std::move(file), readahead_size));
return std::move(wrapped_file);
}
} // namespace rocksdb } // namespace rocksdb

View File

@ -14,6 +14,9 @@ namespace rocksdb {
class Statistics; class Statistics;
class HistogramImpl; class HistogramImpl;
std::unique_ptr<RandomAccessFile> NewReadaheadRandomAccessFile(
std::unique_ptr<RandomAccessFile> file, size_t readahead_size);
class SequentialFileReader { class SequentialFileReader {
private: private:
std::unique_ptr<SequentialFile> file_; std::unique_ptr<SequentialFile> file_;

View File

@ -71,6 +71,7 @@ ImmutableCFOptions::ImmutableCFOptions(const Options& options)
access_hint_on_compaction_start(options.access_hint_on_compaction_start), access_hint_on_compaction_start(options.access_hint_on_compaction_start),
new_table_reader_for_compaction_inputs( new_table_reader_for_compaction_inputs(
options.new_table_reader_for_compaction_inputs), options.new_table_reader_for_compaction_inputs),
compaction_readahead_size(options.compaction_readahead_size),
num_levels(options.num_levels), num_levels(options.num_levels),
optimize_filters_for_hits(options.optimize_filters_for_hits), optimize_filters_for_hits(options.optimize_filters_for_hits),
listeners(options.listeners), listeners(options.listeners),
@ -241,6 +242,7 @@ DBOptions::DBOptions()
db_write_buffer_size(0), db_write_buffer_size(0),
access_hint_on_compaction_start(NORMAL), access_hint_on_compaction_start(NORMAL),
new_table_reader_for_compaction_inputs(false), new_table_reader_for_compaction_inputs(false),
compaction_readahead_size(0),
use_adaptive_mutex(false), use_adaptive_mutex(false),
bytes_per_sync(0), bytes_per_sync(0),
wal_bytes_per_sync(0), wal_bytes_per_sync(0),
@ -294,6 +296,7 @@ DBOptions::DBOptions(const Options& options)
access_hint_on_compaction_start(options.access_hint_on_compaction_start), access_hint_on_compaction_start(options.access_hint_on_compaction_start),
new_table_reader_for_compaction_inputs( new_table_reader_for_compaction_inputs(
options.new_table_reader_for_compaction_inputs), options.new_table_reader_for_compaction_inputs),
compaction_readahead_size(options.compaction_readahead_size),
use_adaptive_mutex(options.use_adaptive_mutex), use_adaptive_mutex(options.use_adaptive_mutex),
bytes_per_sync(options.bytes_per_sync), bytes_per_sync(options.bytes_per_sync),
wal_bytes_per_sync(options.wal_bytes_per_sync), wal_bytes_per_sync(options.wal_bytes_per_sync),
@ -371,6 +374,10 @@ void DBOptions::Dump(Logger* log) const {
access_hints[access_hint_on_compaction_start]); access_hints[access_hint_on_compaction_start]);
Warn(log, " Options.new_table_reader_for_compaction_inputs: %d", Warn(log, " Options.new_table_reader_for_compaction_inputs: %d",
new_table_reader_for_compaction_inputs); new_table_reader_for_compaction_inputs);
Warn(log,
" Options.compaction_readahead_size: %" ROCKSDB_PRIszt
"d",
compaction_readahead_size);
Warn(log, " Options.use_adaptive_mutex: %d", Warn(log, " Options.use_adaptive_mutex: %d",
use_adaptive_mutex); use_adaptive_mutex);
Warn(log, " Options.rate_limiter: %p", Warn(log, " Options.rate_limiter: %p",

View File

@ -85,6 +85,9 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{"new_table_reader_for_compaction_inputs", {"new_table_reader_for_compaction_inputs",
{offsetof(struct DBOptions, new_table_reader_for_compaction_inputs), {offsetof(struct DBOptions, new_table_reader_for_compaction_inputs),
OptionType::kBoolean}}, OptionType::kBoolean}},
{"compaction_readahead_size",
{offsetof(struct DBOptions, compaction_readahead_size),
OptionType::kSizeT}},
{"use_adaptive_mutex", {"use_adaptive_mutex",
{offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean}}, {offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean}},
{"use_fsync", {"use_fsync",

View File

@ -172,9 +172,9 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"advise_random_on_open", "true"}, {"advise_random_on_open", "true"},
{"use_adaptive_mutex", "false"}, {"use_adaptive_mutex", "false"},
{"new_table_reader_for_compaction_inputs", "true"}, {"new_table_reader_for_compaction_inputs", "true"},
{"compaction_readahead_size", "100"},
{"bytes_per_sync", "47"}, {"bytes_per_sync", "47"},
{"wal_bytes_per_sync", "48"}, {"wal_bytes_per_sync", "48"}, };
};
ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions base_cf_opt;
ColumnFamilyOptions new_cf_opt; ColumnFamilyOptions new_cf_opt;
@ -279,6 +279,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_db_opt.advise_random_on_open, true); ASSERT_EQ(new_db_opt.advise_random_on_open, true);
ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); ASSERT_EQ(new_db_opt.use_adaptive_mutex, false);
ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true); ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true);
ASSERT_EQ(new_db_opt.compaction_readahead_size, 100);
ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47)); ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48)); ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
} }